diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 819e3974e3f1330..3ec7e49236101c4 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -241,7 +241,7 @@ Clang-Tidy Checks :doc:`llvmlibc-restrict-system-libc-headers `, "Yes" :doc:`misc-confusable-identifiers `, :doc:`misc-const-correctness `, "Yes" - :doc:`misc-coroutine-hostile-raii `_, + :doc:`misc-coroutine-hostile-raii `, :doc:`misc-definitions-in-headers `, "Yes" :doc:`misc-header-include-cycle `, :doc:`misc-include-cleaner `, "Yes" diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst index dcb9f399774cba9..b8698ba3de85300 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst @@ -1,28 +1,28 @@ .. title:: clang-tidy - misc-coroutine-hostile-raii misc-coroutine-hostile-raii -==================== +=========================== Detects when objects of certain hostile RAII types persists across suspension points in a coroutine. Such hostile types include scoped-lockable types and types belonging to a configurable denylist. -Some objects require that they be destroyed on the same thread that created them. +Some objects require that they be destroyed on the same thread that created them. Traditionally this requirement was often phrased as "must be a local variable", under the assumption that local variables always work this way. However this is incorrect with C++20 coroutines, since an intervening ``co_await`` may cause the coroutine to suspend and later be resumed on another thread. -The lifetime of an object that requires being destroyed on the same thread must +The lifetime of an object that requires being destroyed on the same thread must not encompass a ``co_await`` or ``co_yield`` point. If you create/destroy an object, you must do so without allowing the coroutine to suspend in the meantime. Following types are considered as hostile: - Scoped-lockable types: A scoped-lockable object persisting across a suspension - point is problematic as the lock held by this object could be unlocked by a - different thread. This would be undefined behaviour. - This includes all types annotated with the ``scoped_lockable`` attribute. + point is problematic as the lock held by this object could be unlocked by a + different thread. This would be undefined behaviour. + This includes all types annotated with the ``scoped_lockable`` attribute. - Types belonging to a configurable denylist. @@ -44,7 +44,7 @@ Options .. option:: RAIITypesList - A semicolon-separated list of qualified types which should not be allowed to + A semicolon-separated list of qualified types which should not be allowed to persist across suspension points. Eg: ``my::lockable; a::b;::my::other::lockable;`` - The default value of this option is `"std::lock_guard;std::scoped_lock"`. \ No newline at end of file + The default value of this option is `"std::lock_guard;std::scoped_lock"`. 
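For reference, a minimal sketch of the pattern the new check flags. `task<>` and `sleep()` are hypothetical coroutine/awaitable types used only for illustration; the check itself keys off the `scoped_lockable` attribute and the `RAIITypesList` option, not these names:

```cpp
#include <mutex>

// 'task' is a hypothetical C++20 coroutine return type; any coroutine type works.
task<void> bad(std::mutex &m) {
  std::lock_guard<std::mutex> guard(m); // lock taken on the current thread
  co_await sleep();                     // suspension point: may resume on another thread
  // 'guard' unlocks 'm' here, possibly on a different thread than the one
  // that locked it -- undefined behaviour for scoped-lockable types.
}

task<void> good(std::mutex &m) {
  {
    std::lock_guard<std::mutex> guard(m);
    // critical section with no co_await/co_yield while the lock is held
  } // lock released before suspending
  co_await sleep();
}
```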
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 443325bb0d1e17d..1e77386aede2e5d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -117,8 +117,6 @@ C++ Language Changes C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Fix a bug in conversion sequence of arguments to a function with reversed parameter order. - Fixes `GH `_. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -199,6 +197,9 @@ New Compiler Flags the preprocessed text to the output. This can greatly reduce the size of the preprocessed output, which can be helpful when trying to reduce a test case. +* ``-Wbitfield-conversion`` was added to detect assignments of integral + types to a bitfield that may change the value. + Deprecated Compiler Flags ------------------------- @@ -522,6 +523,10 @@ Bug Fixes to C++ Support with non-type template parameters of reference type. Fixes: (`#65153 `_) +- Clang now properly compares constraints on an out of line class template + declaration definition. Fixes: + (`#61763 `_) + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed an import failure of recursive friend class template. diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index b69c616b0090365..638f886edd095f1 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -607,6 +607,13 @@ class Expr : public ValueStmt { /// foldable. If the expression is foldable, but not a constant expression, /// the notes will describes why it isn't a constant expression. If the /// expression *is* a constant expression, no notes will be produced. + /// + /// FIXME: this causes significant performance concerns and should be + /// refactored at some point. Not all evaluations of the constant + /// expression interpreter will display the given diagnostics, this means + /// those kinds of uses are paying the expense of generating a diagnostic + /// (which may include expensive operations like converting APValue objects + /// to a string representation). 
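The `-Wbitfield-conversion` flag announced in the release notes above pairs with the diagnostic and tests added further down in this patch. A minimal sketch of what it diagnoses (the warning is off by default; enable it with `-Wbitfield-conversion` or `-Wconversion`):

```cpp
struct S {
  int bf : 9;
};

void f(S &s, int x, char c) {
  s.bf = x;        // warning: conversion from 'int' (32 bits) to bit-field 'bf' (9 bits) may change value
  s.bf = c;        // no warning: an 8-bit source always fits in the 9-bit field
  s.bf = 0xff & x; // no warning: the mask narrows the likely source range to 8 bits
}
```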
SmallVectorImpl *Diag = nullptr; EvalStatus() = default; diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 0b09c002191848a..674eb9f4ef2e73f 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -53,6 +53,7 @@ def SingleBitBitFieldConstantConversion : def BitFieldConstantConversion : DiagGroup<"bitfield-constant-conversion", [SingleBitBitFieldConstantConversion]>; def BitFieldEnumConversion : DiagGroup<"bitfield-enum-conversion">; +def BitFieldConversion : DiagGroup<"bitfield-conversion">; def BitFieldWidth : DiagGroup<"bitfield-width">; def CompoundTokenSplitByMacro : DiagGroup<"compound-token-split-by-macro">; def CompoundTokenSplitBySpace : DiagGroup<"compound-token-split-by-space">; @@ -933,6 +934,7 @@ def Conversion : DiagGroup<"conversion", ConstantConversion, EnumConversion, BitFieldEnumConversion, + BitFieldConversion, FloatConversion, Shorten64To32, IntConversion, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 58a33e2807b7b0a..7f39f5e79792c07 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6171,6 +6171,9 @@ def warn_signed_bitfield_enum_conversion : Warning< "signed bit-field %0 needs an extra bit to represent the largest positive " "enumerators of %1">, InGroup, DefaultIgnore; +def warn_bitfield_too_small_for_integral_type : Warning< + "conversion from %2 (%3 bits) to bit-field %0 (%1 bits) may change value">, + InGroup, DefaultIgnore; def note_change_bitfield_sign : Note< "consider making the bitfield type %select{unsigned|signed}0">; diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index f54e65ef7119cc1..25a28052ed0d97f 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1865,10 +1865,21 @@ def SVPTRUE_COUNT : SInst<"svptrue_{d}", "}v", "QcQsQiQl", MergeNone, "aarch64_ def SVPEXT_SINGLE : SInst<"svpext_lane_{d}", "P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext", [], [ImmCheck<1, ImmCheck0_3>]>; def SVPEXT_X2 : SInst<"svpext_lane_{d}_x2", "2.P}i", "QcQsQiQl", MergeNone, "aarch64_sve_pext_x2", [], [ImmCheck<1, ImmCheck0_1>]>; + +def SVPSEL_COUNT_ALIAS_B : SInst<"svpsel_lane_c8", "}}Pm", "Pc", MergeNone, "", [], []>; +def SVPSEL_COUNT_ALIAS_H : SInst<"svpsel_lane_c16", "}}Pm", "Ps", MergeNone, "", [], []>; +def SVPSEL_COUNT_ALIAS_S : SInst<"svpsel_lane_c32", "}}Pm", "Pi", MergeNone, "", [], []>; +def SVPSEL_COUNT_ALIAS_D : SInst<"svpsel_lane_c64", "}}Pm", "Pl", MergeNone, "", [], []>; } let TargetGuard = "sve2p1" in { def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [], []>; def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [], []>; + +def SVPSEL_B : SInst<"svpsel_lane_b8", "PPPm", "Pc", MergeNone, "", [], []>; +def SVPSEL_H : SInst<"svpsel_lane_b16", "PPPm", "Ps", MergeNone, "", [], []>; +def SVPSEL_S : SInst<"svpsel_lane_b32", "PPPm", "Pi", MergeNone, "", [], []>; +def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPm", "Pl", MergeNone, "", [], []>; + def SVCNTP_COUNT : SInst<"svcntp_{d}", "n}i", "QcQsQiQl", MergeNone, "aarch64_sve_cntp_{d}", [IsOverloadNone], [ImmCheck<1, ImmCheck2_4_Mul2>]>; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 250ac33680cdbc7..73fee208cbef17e 100644 --- a/clang/include/clang/Sema/Sema.h +++ 
b/clang/include/clang/Sema/Sema.h @@ -3809,17 +3809,6 @@ class Sema final { // the purposes of [temp.friend] p9. bool FriendConstraintsDependOnEnclosingTemplate(const FunctionDecl *FD); - // Calculates whether two constraint expressions are equal irrespective of a - // difference in 'depth'. This takes a pair of optional 'NamedDecl's 'Old' and - // 'New', which are the "source" of the constraint, since this is necessary - // for figuring out the relative 'depth' of the constraint. The depth of the - // 'primary template' and the 'instantiated from' templates aren't necessarily - // the same, such as a case when one is a 'friend' defined in a class. - bool AreConstraintExpressionsEqual(const NamedDecl *Old, - const Expr *OldConstr, - const NamedDecl *New, - const Expr *NewConstr); - enum class AllowedExplicit { /// Allow no explicit functions to be used. None, @@ -8615,8 +8604,48 @@ class Sema final { TPL_TemplateParamsEquivalent, }; + // A struct to represent the 'new' declaration, which is either itself just + // the named decl, or the important information we need about it in order to + // do constraint comparisons. + class TemplateCompareNewDeclInfo { + const NamedDecl *ND = nullptr; + const DeclContext *DC = nullptr; + const DeclContext *LexicalDC = nullptr; + SourceLocation Loc; + + public: + TemplateCompareNewDeclInfo(const NamedDecl *ND) : ND(ND) {} + TemplateCompareNewDeclInfo(const DeclContext *DeclCtx, + const DeclContext *LexicalDeclCtx, + SourceLocation Loc) + + : DC(DeclCtx), LexicalDC(LexicalDeclCtx), Loc(Loc) { + assert(DC && LexicalDC && + "Constructor only for cases where we have the information to put " + "in here"); + } + + // If this was constructed with no information, we cannot do substitution + // for constraint comparison, so make sure we can check that. + bool isInvalid() const { return !ND && !DC; } + + const NamedDecl *getDecl() const { return ND; } + + bool ContainsDecl(const NamedDecl *ND) const { return this->ND == ND; } + + const DeclContext *getLexicalDeclContext() const { + return ND ? ND->getLexicalDeclContext() : LexicalDC; + } + + const DeclContext *getDeclContext() const { + return ND ? ND->getDeclContext() : DC; + } + + SourceLocation getLocation() const { return ND ? ND->getLocation() : Loc; } + }; + bool TemplateParameterListsAreEqual( - const NamedDecl *NewInstFrom, TemplateParameterList *New, + const TemplateCompareNewDeclInfo &NewInstFrom, TemplateParameterList *New, const NamedDecl *OldInstFrom, TemplateParameterList *Old, bool Complain, TemplateParameterListEqualKind Kind, SourceLocation TemplateArgLoc = SourceLocation()); @@ -8629,6 +8658,17 @@ class Sema final { Kind, TemplateArgLoc); } + // Calculates whether two constraint expressions are equal irrespective of a + // difference in 'depth'. This takes a pair of optional 'NamedDecl's 'Old' and + // 'New', which are the "source" of the constraint, since this is necessary + // for figuring out the relative 'depth' of the constraint. The depth of the + // 'primary template' and the 'instantiated from' templates aren't necessarily + // the same, such as a case when one is a 'friend' defined in a class. 
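The depth mismatch described in the comment above is easiest to see in the GH61763 pattern; the following is a reduced form of the test added later in this patch. The constraint written inside the class template and the one on the out-of-line definition refer to equivalent template parameters at different depths:

```cpp
template <typename T, typename U>
concept same_as = true; // reduced for illustration; a real same_as compares T and U

template <typename... Types>
struct Foo {
  // Here Param's constraint names T at depth 1 (Foo's Types occupies depth 0)...
  template <typename T, same_as<T> Param>
  friend struct Bar;
};

// ...while here the corresponding T sits at depth 0, so the two constraint
// expressions must be compared modulo depth to be recognized as equal.
template <typename T, same_as<T> Param>
struct Bar {};
```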
+ bool AreConstraintExpressionsEqual(const NamedDecl *Old, + const Expr *OldConstr, + const TemplateCompareNewDeclInfo &New, + const Expr *NewConstr); + bool CheckTemplateDeclScope(Scope *S, TemplateParameterList *TemplateParams); /// Called when the parser has parsed a C++ typename @@ -9368,13 +9408,12 @@ class Sema final { // C++ Template Instantiation // - MultiLevelTemplateArgumentList - getTemplateInstantiationArgs(const NamedDecl *D, bool Final = false, - const TemplateArgumentList *Innermost = nullptr, - bool RelativeToPrimary = false, - const FunctionDecl *Pattern = nullptr, - bool ForConstraintInstantiation = false, - bool SkipForSpecialization = false); + MultiLevelTemplateArgumentList getTemplateInstantiationArgs( + const NamedDecl *D, const DeclContext *DC = nullptr, bool Final = false, + const TemplateArgumentList *Innermost = nullptr, + bool RelativeToPrimary = false, const FunctionDecl *Pattern = nullptr, + bool ForConstraintInstantiation = false, + bool SkipForSpecialization = false); /// A context in which code is being synthesized (where a source location /// alone is not sufficient to identify the context). This covers template diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 28d603bf115950a..2a553054a0ce51c 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -213,7 +213,9 @@ enum class TemplateSubstitutionKind : char { "substituted args outside retained args?"); assert(getKind() == TemplateSubstitutionKind::Specialization); TemplateArgumentLists.push_back( - {{AssociatedDecl->getCanonicalDecl(), Final}, Args}); + {{AssociatedDecl ? AssociatedDecl->getCanonicalDecl() : nullptr, + Final}, + Args}); } void addOuterTemplateArguments(ArgList Args) { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 116af1435fe6e40..3602c6564893d0a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10007,7 +10007,33 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { default: return nullptr; - + case SVE::BI__builtin_sve_svpsel_lane_b8: + case SVE::BI__builtin_sve_svpsel_lane_b16: + case SVE::BI__builtin_sve_svpsel_lane_b32: + case SVE::BI__builtin_sve_svpsel_lane_b64: + case SVE::BI__builtin_sve_svpsel_lane_c8: + case SVE::BI__builtin_sve_svpsel_lane_c16: + case SVE::BI__builtin_sve_svpsel_lane_c32: + case SVE::BI__builtin_sve_svpsel_lane_c64: { + bool IsSVCount = isa(Ops[0]->getType()); + assert(((!IsSVCount || cast(Ops[0]->getType())->getName() == + "aarch64.svcount")) && + "Unexpected TargetExtType"); + auto SVCountTy = + llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + Function *CastFromSVCountF = + CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); + Function *CastToSVCountF = + CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); + + auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier)); + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy); + llvm::Value *Ops0 = + IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0]; + llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy); + llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops[2]}); + return IsSVCount ? 
Builder.CreateCall(CastToSVCountF, PSel) : PSel; + } case SVE::BI__builtin_sve_svmov_b_z: { // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op) SVETypeFlags TypeFlags(Builtin->TypeModifier); diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 26bc45e37b24174..ecff8ddc4ee766f 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -222,6 +222,14 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lm"); } + // Additional linker set-up and flags for Fortran. This is required in order + // to generate executables. As Fortran runtime depends on the C runtime, + // these dependencies need to be listed before the C runtime below. + if (D.IsFlangMode()) { + addFortranRuntimeLibraryPath(getToolChain(), Args, CmdArgs); + addFortranRuntimeLibs(getToolChain(), CmdArgs); + CmdArgs.push_back("-lm"); + } if (Args.hasArg(options::OPT_fstack_protector) || Args.hasArg(options::OPT_fstack_protector_strong) || Args.hasArg(options::OPT_fstack_protector_all)) { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 31b7e6cc8b8922a..cd7c26a84b6cce0 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -14331,6 +14331,18 @@ static bool AnalyzeBitFieldAssignment(Sema &S, FieldDecl *Bitfield, Expr *Init, S.Diag(WidthExpr->getExprLoc(), diag::note_widen_bitfield) << BitsNeeded << ED << WidthExpr->getSourceRange(); } + } else if (OriginalInit->getType()->isIntegralType(S.Context)) { + IntRange LikelySourceRange = + GetExprRange(S.Context, Init, S.isConstantEvaluatedContext(), + /*Approximate=*/true); + + if (LikelySourceRange.Width > FieldWidth) { + Expr *WidthExpr = Bitfield->getBitWidth(); + S.Diag(InitLoc, diag::warn_bitfield_too_small_for_integral_type) + << Bitfield << FieldWidth << OriginalInit->getType() + << LikelySourceRange.Width; + S.Diag(WidthExpr->getExprLoc(), diag::note_declared_at); + } } return false; @@ -15228,7 +15240,6 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, if (LikelySourceRange.Width > TargetRange.Width) { // If the source is a constant, use a default-on diagnostic. - // TODO: this should happen for bitfield stores, too. Expr::EvalResult Result; if (E->EvaluateAsInt(Result, S.Context, Expr::SE_AllowSideEffects, S.isConstantEvaluatedContext())) { diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 0ef03293b46ffb7..719c6aab74e0173 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -657,11 +657,11 @@ Sema::SetupConstraintCheckingTemplateArgumentsAndScope( // Collect the list of template arguments relative to the 'primary' template. // We need the entire list, since the constraint is completely uninstantiated // at this point. 
- MLTAL = - getTemplateInstantiationArgs(FD, /*Final=*/false, /*Innermost=*/nullptr, - /*RelativeToPrimary=*/true, - /*Pattern=*/nullptr, - /*ForConstraintInstantiation=*/true); + MLTAL = getTemplateInstantiationArgs(FD, FD->getLexicalDeclContext(), + /*Final=*/false, /*Innermost=*/nullptr, + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true); if (SetupConstraintScope(FD, TemplateArgs, MLTAL, Scope)) return std::nullopt; @@ -736,7 +736,8 @@ static unsigned CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND, bool SkipForSpecialization = false) { MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - ND, /*Final=*/false, /*Innermost=*/nullptr, /*RelativeToPrimary=*/true, + ND, ND->getLexicalDeclContext(), /*Final=*/false, /*Innermost=*/nullptr, + /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true, SkipForSpecialization); return MLTAL.getNumLevels(); @@ -770,28 +771,31 @@ namespace { }; } // namespace -static const Expr *SubstituteConstraintExpression(Sema &S, const NamedDecl *ND, - const Expr *ConstrExpr) { +static const Expr * +SubstituteConstraintExpression(Sema &S, + const Sema::TemplateCompareNewDeclInfo &DeclInfo, + const Expr *ConstrExpr) { MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - ND, /*Final=*/false, /*Innermost=*/nullptr, + DeclInfo.getDecl(), DeclInfo.getLexicalDeclContext(), /*Final=*/false, + /*Innermost=*/nullptr, /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true, /*SkipForSpecialization*/ false); + if (MLTAL.getNumSubstitutedLevels() == 0) return ConstrExpr; Sema::SFINAETrap SFINAE(S, /*AccessCheckingSFINAE=*/false); Sema::InstantiatingTemplate Inst( - S, ND->getLocation(), + S, DeclInfo.getLocation(), Sema::InstantiatingTemplate::ConstraintNormalization{}, - const_cast(ND), SourceRange{}); - + const_cast(DeclInfo.getDecl()), SourceRange{}); if (Inst.isInvalid()) return nullptr; std::optional ThisScope; - if (auto *RD = dyn_cast(ND->getDeclContext())) + if (auto *RD = dyn_cast(DeclInfo.getDeclContext())) ThisScope.emplace(S, const_cast(RD), Qualifiers()); ExprResult SubstConstr = S.SubstConstraintExpr(const_cast(ConstrExpr), MLTAL); @@ -802,13 +806,13 @@ static const Expr *SubstituteConstraintExpression(Sema &S, const NamedDecl *ND, bool Sema::AreConstraintExpressionsEqual(const NamedDecl *Old, const Expr *OldConstr, - const NamedDecl *New, + const TemplateCompareNewDeclInfo &New, const Expr *NewConstr) { if (OldConstr == NewConstr) return true; // C++ [temp.constr.decl]p4 - if (Old && New && Old != New && - Old->getLexicalDeclContext() != New->getLexicalDeclContext()) { + if (Old && !New.isInvalid() && !New.ContainsDecl(Old) && + Old->getLexicalDeclContext() != New.getLexicalDeclContext()) { if (const Expr *SubstConstr = SubstituteConstraintExpression(*this, Old, OldConstr)) OldConstr = SubstConstr; @@ -1252,7 +1256,8 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, TemplateArgumentList TAL{TemplateArgumentList::OnStack, CSE->getTemplateArguments()}; MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - CSE->getNamedConcept(), /*Final=*/false, &TAL, + CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(), + /*Final=*/false, &TAL, /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index c271cebb9eb638f..ce78994e6553814 100644 --- 
a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -7688,7 +7688,7 @@ bool Sema::CheckNonDependentConversions( QualType ParamType = ParamTypes[I + Offset]; if (!ParamType->isDependentType()) { unsigned ConvIdx = PO == OverloadCandidateParamOrder::Reversed - ? Args.size() - 1 - (ThisConversions + I) + ? 0 : (ThisConversions + I); Conversions[ConvIdx] = TryCopyInitialization(*this, Args[I], ParamType, diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 6389ec708bf34ae..f0197f7c102a857 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1995,10 +1995,13 @@ DeclResult Sema::CheckClassTemplate( // for a friend in a dependent context: the template parameter list itself // could be dependent. if (!(TUK == TUK_Friend && CurContext->isDependentContext()) && - !TemplateParameterListsAreEqual(TemplateParams, - PrevClassTemplate->getTemplateParameters(), - /*Complain=*/true, - TPL_TemplateMatch)) + !TemplateParameterListsAreEqual( + TemplateCompareNewDeclInfo(SemanticContext ? SemanticContext + : CurContext, + CurContext, KWLoc), + TemplateParams, PrevClassTemplate, + PrevClassTemplate->getTemplateParameters(), /*Complain=*/true, + TPL_TemplateMatch)) return true; // C++ [temp.class]p4: @@ -6203,7 +6206,7 @@ bool Sema::CheckTemplateArgumentList( CXXThisScopeRAII(*this, RD, ThisQuals, RD != nullptr); MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs( - Template, /*Final=*/false, &StackTemplateArgs, + Template, NewContext, /*Final=*/false, &StackTemplateArgs, /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConceptInstantiation=*/true); @@ -8017,7 +8020,8 @@ Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg, /// Match two template parameters within template parameter lists. static bool MatchTemplateParameterKind( - Sema &S, NamedDecl *New, const NamedDecl *NewInstFrom, NamedDecl *Old, + Sema &S, NamedDecl *New, + const Sema::TemplateCompareNewDeclInfo &NewInstFrom, NamedDecl *Old, const NamedDecl *OldInstFrom, bool Complain, Sema::TemplateParameterListEqualKind Kind, SourceLocation TemplateArgLoc) { // Check the actual kind (type, non-type, template). @@ -8105,8 +8109,8 @@ static bool MatchTemplateParameterKind( // For template template parameters, check the template parameter types. // The template parameter lists of template template // parameters must agree. - else if (TemplateTemplateParmDecl *OldTTP - = dyn_cast(Old)) { + else if (TemplateTemplateParmDecl *OldTTP = + dyn_cast(Old)) { TemplateTemplateParmDecl *NewTTP = cast(New); if (!S.TemplateParameterListsAreEqual( NewInstFrom, NewTTP->getTemplateParameters(), OldInstFrom, @@ -8210,7 +8214,7 @@ void DiagnoseTemplateParameterListArityMismatch(Sema &S, /// \returns True if the template parameter lists are equal, false /// otherwise. 
bool Sema::TemplateParameterListsAreEqual( - const NamedDecl *NewInstFrom, TemplateParameterList *New, + const TemplateCompareNewDeclInfo &NewInstFrom, TemplateParameterList *New, const NamedDecl *OldInstFrom, TemplateParameterList *Old, bool Complain, TemplateParameterListEqualKind Kind, SourceLocation TemplateArgLoc) { if (Old->size() != New->size() && Kind != TPL_TemplateTemplateArgumentMatch) { diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 62fbd903a04044b..8f115f2177846b1 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2889,7 +2889,7 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, CanonicalDeducedArgs}; MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( - Template, /*Final=*/false, + Template, Template->getDeclContext(), /*Final=*/false, /*InnerMost=*/NeedsReplacement ? nullptr : &DeducedTAL, /*RelativeToPrimary=*/true, /*Pattern=*/ nullptr, /*ForConstraintInstantiation=*/true); diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index d7d5ce19b75a965..effc97a033c5eee 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -312,6 +312,10 @@ Response HandleGenericDeclContext(const Decl *CurDecl) { /// \param ND the declaration for which we are computing template instantiation /// arguments. /// +/// \param DC In the event we don't HAVE a declaration yet, we instead provide +/// the decl context where it will be created. In this case, the `Innermost` +/// should likely be provided. If ND is non-null, this is ignored. +/// /// \param Innermost if non-NULL, specifies a template argument list for the /// template declaration passed as ND. /// @@ -331,10 +335,11 @@ Response HandleGenericDeclContext(const Decl *CurDecl) { /// arguments on an enclosing class template. MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( - const NamedDecl *ND, bool Final, const TemplateArgumentList *Innermost, - bool RelativeToPrimary, const FunctionDecl *Pattern, - bool ForConstraintInstantiation, bool SkipForSpecialization) { - assert(ND && "Can't find arguments for a decl if one isn't provided"); + const NamedDecl *ND, const DeclContext *DC, bool Final, + const TemplateArgumentList *Innermost, bool RelativeToPrimary, + const FunctionDecl *Pattern, bool ForConstraintInstantiation, + bool SkipForSpecialization) { + assert((ND || DC) && "Can't find arguments for a decl if one isn't provided"); // Accumulate the set of template argument lists in this structure. 
MultiLevelTemplateArgumentList Result; @@ -346,6 +351,9 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( CurDecl = Response::UseNextDecl(ND).NextDecl; } + if (!ND) + CurDecl = Decl::castFromDeclContext(DC); + while (!CurDecl->isFileContextDecl()) { Response R; if (const auto *VarTemplSpec = @@ -369,6 +377,8 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( R = HandleImplicitConceptSpecializationDecl(CSD, Result); } else if (const auto *FTD = dyn_cast(CurDecl)) { R = HandleFunctionTemplateDecl(FTD, Result); + } else if (const auto *CTD = dyn_cast(CurDecl)) { + R = Response::ChangeDecl(CTD->getLexicalDeclContext()); } else if (!isa(CurDecl)) { R = Response::DontClearRelativeToPrimaryNextDecl(CurDecl); if (CurDecl->getDeclContext()->isTranslationUnit()) { diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index ec0f7d1fe0ddd8e..78a7892a35a320b 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -4626,7 +4626,8 @@ bool Sema::InstantiateDefaultArgument(SourceLocation CallLoc, FunctionDecl *FD, // template // A Foo(int a = A::FooImpl()); MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs( - FD, /*Final=*/false, nullptr, /*RelativeToPrimary=*/true); + FD, FD->getLexicalDeclContext(), /*Final=*/false, nullptr, + /*RelativeToPrimary=*/true); if (SubstDefaultArgument(CallLoc, Param, TemplateArgs, /*ForCallExpr*/ true)) return true; @@ -4665,7 +4666,8 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation, LocalInstantiationScope Scope(*this); MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs( - Decl, /*Final=*/false, nullptr, /*RelativeToPrimary*/ true); + Decl, Decl->getLexicalDeclContext(), /*Final=*/false, nullptr, + /*RelativeToPrimary*/ true); // FIXME: We can't use getTemplateInstantiationPattern(false) in general // here, because for a non-defining friend declaration in a class template, @@ -5107,7 +5109,8 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, SetDeclDefaulted(Function, PatternDecl->getLocation()); } else { MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs( - Function, /*Final=*/false, nullptr, false, PatternDecl); + Function, Function->getLexicalDeclContext(), /*Final=*/false, nullptr, + false, PatternDecl); // Substitute into the qualifier; we can get a substitution failure here // through evil use of alias templates. 
diff --git a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp index 02fe37dc1be5058..5c6804eb7726b5f 100644 --- a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp +++ b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp @@ -324,41 +324,6 @@ bool x = X() == X(); // expected-warning {{ambiguous}} } } // namespace P2468R2 -namespace GH53954{ -namespace test1 { -struct P { - template - friend bool operator==(const P&, const T&); // expected-note {{candidate}} \ - // expected-note {{reversed parameter order}} -}; -struct A : public P {}; -struct B : public P {}; -bool check(A a, B b) { return a == b; } // expected-error {{ '==' is ambiguous}} -} - -namespace test2 { -struct P { - template - friend bool operator==(const T&, const P&); // expected-note {{candidate}} \ - // expected-note {{reversed parameter order}} -}; -struct A : public P {}; -struct B : public P {}; -bool check(A a, B b) { return a == b; } // expected-error {{ '==' is ambiguous}} -} - -namespace test3 { -struct P { - template - bool operator==(const S &) const; // expected-note {{candidate}} \ - // expected-note {{reversed parameter order}} -}; -struct A : public P {}; -struct B : public P {}; -bool check(A a, B b) { return a == b; } // expected-error {{ '==' is ambiguous}} -} -} - #else // NO_ERRORS namespace problem_cases { diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c new file mode 100644 index 000000000000000..97354d75d7b8743 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_psel.c @@ -0,0 +1,165 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve2p1 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svpsel_lane_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 15 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[ADD]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z19test_svpsel_lane_b8u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[P1:%.*]], [[P2:%.*]], i32 [[ADD]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svbool_t test_svpsel_lane_b8(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b8(p1, p2, idx + 15); +} + +// CHECK-LABEL: @test_svpsel_lane_b16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b16u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 7 +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpsel_lane_b16(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b16(p1, p2, idx + 7); +} + +// CHECK-LABEL: @test_svpsel_lane_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b32u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpsel_lane_b32(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b32(p1, p2, idx + 3); +} + +// CHECK-LABEL: @test_svpsel_lane_b64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) +// CHECK-NEXT: ret [[TMP1]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_b64u10__SVBool_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[P1:%.*]], [[TMP0]], i32 [[ADD]]) +// CPP-CHECK-NEXT: ret [[TMP1]] +// +svbool_t test_svpsel_lane_b64(svbool_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_b64(p1, p2, idx + 1); +} + +// CHECK-LABEL: @test_svpsel_lane_c8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 15 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[TMP0]], [[P2:%.*]], i32 [[ADD]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP1]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z19test_svpsel_lane_c8u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 15 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.psel.nxv16i1( [[TMP0]], [[P2:%.*]], i32 [[ADD]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP1]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP2]] +// +svcount_t test_svpsel_lane_c8(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c8(p1, p2, idx + 15); +} + +// CHECK-LABEL: @test_svpsel_lane_c16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = 
tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[ADD]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c16u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 7 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv8i1( [[TMP0]], [[TMP1]], i32 [[ADD]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +svcount_t test_svpsel_lane_c16(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c16(p1, p2, idx + 7); +} + +// CHECK-LABEL: @test_svpsel_lane_c32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[ADD]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c32u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 3 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv4i1( [[TMP0]], [[TMP1]], i32 [[ADD]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +svcount_t test_svpsel_lane_c32(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c32(p1, p2, idx + 3); +} + +// CHECK-LABEL: @test_svpsel_lane_c64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[ADD]]) +// CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +// CPP-CHECK-LABEL: @_Z20test_svpsel_lane_c64u11__SVCount_tu10__SVBool_tj( +// CPP-CHECK-NEXT: 
entry: +// CPP-CHECK-NEXT: [[ADD:%.*]] = add i32 [[IDX:%.*]], 1 +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[P1:%.*]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[P2:%.*]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.psel.nxv2i1( [[TMP0]], [[TMP1]], i32 [[ADD]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt( [[TMP2]]) +// CPP-CHECK-NEXT: ret target("aarch64.svcount") [[TMP3]] +// +svcount_t test_svpsel_lane_c64(svcount_t p1, svbool_t p2, uint32_t idx) { + return svpsel_lane_c64(p1, p2, idx + 1); +} diff --git a/clang/test/SemaCXX/bitfield-width.c b/clang/test/SemaCXX/bitfield-width.c new file mode 100644 index 000000000000000..7b4e4444c245b0e --- /dev/null +++ b/clang/test/SemaCXX/bitfield-width.c @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 -Wconversion -fsyntax-only -verify %s +// RUN: %clang_cc1 -Wbitfield-conversion -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple armebv7-unknown-linux -Wbitfield-conversion \ +// RUN: -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple arm64-unknown-linux -Wbitfield-conversion \ +// RUN: -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple arm-unknown-linux -Wbitfield-conversion \ +// RUN: -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-unknown-linux -Wbitfield-conversion \ +// RUN: -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple mipsel-unknown-linux -Wbitfield-conversion \ +// RUN: -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple mips64el-unknown-linux -Wbitfield-conversion \ +// RUN: -fsyntax-only -verify %s + +typedef struct _xx { + int bf:9; // expected-note 3{{declared here}} + } xx, *pxx; + + xx vxx; + + void foo1(int x) { + vxx.bf = x; // expected-warning{{conversion from 'int' (32 bits) to bit-field 'bf' (9 bits) may change value}} + } + void foo2(short x) { + vxx.bf = x; // expected-warning{{conversion from 'short' (16 bits) to bit-field 'bf' (9 bits) may change value}} + } + void foo3(char x) { + vxx.bf = x; // no warning expected + } + void foo4(short x) { + vxx.bf = 0xff & x; // no warning expected + } + void foo5(short x) { + vxx.bf = 0x1ff & x; // no warning expected + } + void foo6(short x) { + vxx.bf = 0x3ff & x; // expected-warning{{conversion from 'int' (10 bits) to bit-field 'bf' (9 bits) may change value}} + } + int fee(void) { + return 0; + } diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp index f067c02ca48f584..ed2d50d7e0a6b1c 100644 --- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp +++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp @@ -466,3 +466,40 @@ int Outermost::Middle::Innermost::func(Param param) const { } } // namespace GH65810 + +namespace GH61763 { +template +concept same_as = true; + +template +struct Foo { + template Param> + friend struct Bar; +}; + +template struct Foo<>; + +template Param> +struct Bar { +}; + + +template +concept ok = true; + +struct outer { + template + requires ok + struct foo {}; +}; + +template +struct bar { + template + requires ok + friend struct outer::foo; +}; + +bar x; +} // namespace GH61763 + diff --git a/compiler-rt/test/hwasan/TestCases/deep-recursion.c b/compiler-rt/test/hwasan/TestCases/deep-recursion.c index 19d2b50726bee51..764ec5692372b4a 100644 --- a/compiler-rt/test/hwasan/TestCases/deep-recursion.c +++ 
b/compiler-rt/test/hwasan/TestCases/deep-recursion.c @@ -17,7 +17,8 @@ // Stack histories are currently not recorded on x86. // XFAIL: target=x86_64{{.*}} -#include +#include +#include #include // At least -O1 is needed for this function to not have a stack frame on @@ -33,11 +34,12 @@ __attribute__((noinline)) void OOB() { int y[4]; // Tags for stack-allocated variables can occasionally be zero, resulting in - // a false negative for this test. This is not easy to fix, hence we work - // around it: if the tag is zero, we use the neighboring variable instead, - // which must have a different (hence non-zero) tag. - // This tag check assumes aarch64. - if (((uintptr_t)&x) >> 56 == 0) { + // a false negative for this test. The tag allocation algorithm is not easy + // to fix, hence we work around it: if the tag is zero, we use the + // neighboring variable instead, which must have a different (hence non-zero) + // tag. + if (__hwasan_tag_pointer(x, 0) == x) { + assert(__hwasan_tag_pointer(y, 0) != y); y[four] = 0; } else { x[four] = 0; diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar.c b/compiler-rt/test/hwasan/TestCases/stack-uar.c index b4a817351029d9b..48440a47d5f5f42 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uar.c @@ -9,14 +9,29 @@ // Stack histories currently are not recorded on x86. // XFAIL: target=x86_64{{.*}} +#include +#include + void USE(void *x) { // pretend_to_do_something(void *x) __asm__ __volatile__("" : : "r" (x) : "memory"); } __attribute__((noinline)) char *buggy() { - char zzz[0x1000]; - char *volatile p = zzz; + char zzz[0x800]; + char yyy[0x800]; + // Tags for stack-allocated variables can occasionally be zero, resulting in + // a false negative for this test. The tag allocation algorithm is not easy + // to fix, hence we work around it: if the tag is zero, we use the + // neighboring variable instead, which must have a different (hence non-zero) + // tag. 
+ char *volatile p; + if (__hwasan_tag_pointer(zzz, 0) == zzz) { + assert(__hwasan_tag_pointer(yyy, 0) != yyy); + p = yyy; + } else { + p = zzz; + } return p; } @@ -35,7 +50,7 @@ int main() { // CHECK: Cause: stack tag-mismatch // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: - // CHECK-NEXT: zzz in buggy {{.*}}stack-uar.c:[[@LINE-20]] + // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uar.c: // CHECK-NEXT: Memory tags around the buggy address // NOSYM: Previously allocated frames: diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index faa5164f52573ce..c8dcc91064415fa 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1854,9 +1854,10 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *privateClause = std::get_if( &clause.u)) { - genPrivatizations( - privateClause->v, converter, semanticsContext, stmtCtx, - privateOperands, privatizations); + if (!outerCombined) + genPrivatizations( + privateClause->v, converter, semanticsContext, stmtCtx, + privateOperands, privatizations); } else if (const auto *firstprivateClause = std::get_if( &clause.u)) { @@ -1866,8 +1867,9 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *reductionClause = std::get_if( &clause.u)) { - genReductions(reductionClause->v, converter, semanticsContext, stmtCtx, - reductionOperands, reductionRecipes); + if (!outerCombined) + genReductions(reductionClause->v, converter, semanticsContext, stmtCtx, + reductionOperands, reductionRecipes); } else if (const auto *defaultClause = std::get_if( &clause.u)) { diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90 index 09b8a224df13828..213bc032d964504 100644 --- a/flang/test/Driver/linker-flags.f90 +++ b/flang/test/Driver/linker-flags.f90 @@ -2,15 +2,16 @@ ! invocation. These libraries are added on top of other standard runtime ! libraries that the Clang driver will include. -! RUN: %flang -### -target ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,GNU -! RUN: %flang -### -target aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN -! RUN: %flang -### -target x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW +! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX +! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN +! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX +! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW ! NOTE: Clang's driver library, clangDriver, usually adds 'libcmt' and ! 'oldnames' on Windows, but they are not needed when compiling ! Fortran code and they might bring in additional dependencies. ! Make sure they're not added. -! RUN: %flang -### -target aarch64-windows-msvc -fuse-ld= %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MSVC --implicit-check-not libcmt --implicit-check-not oldnames +! RUN: %flang -### --target=aarch64-windows-msvc -fuse-ld= %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MSVC --implicit-check-not libcmt --implicit-check-not oldnames ! Compiler invocation to generate the object file ! CHECK-LABEL: {{.*}} "-emit-obj" @@ -21,12 +22,9 @@ ! 
run on any other platform, such as Windows that use a .exe ! suffix. Clang's driver will try to resolve the path to the ld ! executable and may find the GNU linker from MinGW or Cygwin. -! GNU-LABEL: "{{.*}}ld{{(\.exe)?}}" -! GNU-SAME: "[[object_file]]" -! GNU-SAME: -lFortran_main -! GNU-SAME: -lFortranRuntime -! GNU-SAME: -lFortranDecimal -! GNU-SAME: -lm +! UNIX-LABEL: "{{.*}}ld{{(\.exe)?}}" +! UNIX-SAME: "[[object_file]]" +! UNIX-SAME: "-lFortran_main" "-lFortranRuntime" "-lFortranDecimal" "-lm" ! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}" ! DARWIN-SAME: "[[object_file]]" diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index 80b1272bd1b10b6..eea4950b6d38f92 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -483,11 +483,9 @@ subroutine acc_parallel_loop a(i) = b(i) END DO -! FIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[A]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} -! HLFIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! FIR: %[[ACC_PRIVATE_B:.*]] = acc.firstprivate varPtr(%[[B]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! HLFIR: %[[ACC_PRIVATE_B:.*]] = acc.firstprivate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.parallel firstprivate(@firstprivatization_section_ext10_ref_10xf32 -> %[[ACC_PRIVATE_B]] : !fir.ref>) private(@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) { +! CHECK: acc.parallel firstprivate(@firstprivatization_section_ext10_ref_10xf32 -> %[[ACC_PRIVATE_B]] : !fir.ref>) { ! FIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[A]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! HLFIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: acc.loop private(@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) { @@ -772,7 +770,7 @@ subroutine acc_parallel_loop reduction_i = 1 end do -! CHECK: acc.parallel reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { +! CHECK: acc.parallel { ! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90 index 9ce1828e63ddf10..80b474b348c1c2c 100644 --- a/flang/test/Lower/OpenACC/acc-private.f90 +++ b/flang/test/Lower/OpenACC/acc-private.f90 @@ -268,9 +268,10 @@ subroutine acc_private_assumed_shape(a, n) ! CHECK-LABEL: func.func @_QPacc_private_assumed_shape( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box> {fir.bindc_name = "a"} ! HLFIR: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFacc_private_assumed_shapeEa"} : (!fir.box>) -> (!fir.box>, !fir.box>) +! HLFIR: acc.parallel { ! HLFIR: %[[ADDR:.*]] = fir.box_addr %[[DECL_A]]#1 : (!fir.box>) -> !fir.ref> ! HLFIR: %[[PRIVATE:.*]] = acc.private varPtr(%[[ADDR]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} -! HLFIR: acc.parallel private(@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.ref>) { +! HLFIR: acc.loop private(@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.ref>) { subroutine acc_private_allocatable_array(a, n) integer, allocatable :: a(:) @@ -289,10 +290,11 @@ subroutine acc_private_allocatable_array(a, n) ! CHECK-LABEL: func.func @_QPacc_private_allocatable_array( ! 
CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "a"} ! HLFIR: %[[DECLA_A:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFacc_private_allocatable_arrayEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! HLFIR: acc.parallel { ! HLFIR: %[[BOX:.*]] = fir.load %[[DECLA_A]]#1 : !fir.ref>>> ! HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap> ! HLFIR: %[[PRIVATE:.*]] = acc.private varPtr(%[[BOX_ADDR]] : !fir.heap>) bounds(%{{.*}}) -> !fir.heap> {name = "a"} -! HLFIR: acc.parallel private(@privatization_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.heap>) +! HLFIR: acc.loop private(@privatization_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.heap>) ! HLFIR: acc.serial private(@privatization_box_heap_Uxi32 -> %{{.*}} : !fir.heap>) subroutine acc_private_pointer_array(a, n) @@ -308,10 +310,11 @@ subroutine acc_private_pointer_array(a, n) ! CHECK-LABEL: func.func @_QPacc_private_pointer_array( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) { ! HLFIR: %[[DECL_A:.*]]:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFacc_private_pointer_arrayEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! HLFIR: acc.parallel { ! HLFIR: %[[BOX:.*]] = fir.load %[[DECLA_A]]#1 : !fir.ref>>> ! HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.ptr> ! HLFIR: %[[PRIVATE:.*]] = acc.private varPtr(%[[BOX_ADDR]] : !fir.ptr>) bounds(%{{.*}}) -> !fir.ptr> {name = "a"} -! HLFIR: acc.parallel private(@privatization_box_ptr_Uxi32 -> %[[PRIVATE]] : !fir.ptr>) +! HLFIR: acc.loop private(@privatization_box_ptr_Uxi32 -> %[[PRIVATE]] : !fir.ptr>) subroutine acc_private_dynamic_extent(a, n) integer :: n, i @@ -327,8 +330,9 @@ subroutine acc_private_dynamic_extent(a, n) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref {fir.bindc_name = "n"}) { ! HLFIR: %[[DECL_N:.*]]:2 = hlfir.declare %arg1 {uniq_name = "_QFacc_private_dynamic_extentEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! HLFIR: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%16) {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref>, !fir.shape<3>) -> (!fir.box>, !fir.ref>) +! HLFIR: acc.parallel { ! HLFIR: %[[PRIV:.*]] = acc.private varPtr(%[[DECL_A]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref> {name = "a"} -! HLFIR: acc.parallel private(@privatization_ref_UxUx2xi32 -> %[[PRIV]] : !fir.ref>) +! HLFIR: acc.loop private(@privatization_ref_UxUx2xi32 -> %[[PRIV]] : !fir.ref>) subroutine acc_firstprivate_assumed_shape(a, n) integer :: a(:), i, n diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 466c679320a94ea..fb7e3615b698c1c 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -3,22 +3,23 @@ ! RUN: bbc -fopenacc -emit-fir %s -o - | FileCheck %s --check-prefixes=CHECK,FIR ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s --check-prefixes=CHECK,HLFIR -! CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_section_ext10_ref_10xf32 : !fir.ref> init { +! CHECK-LABEL: acc.private.recipe @privatization_ref_10xf32 : !fir.ref> init { ! CHECK: ^bb0(%{{.*}}: !fir.ref>): ! HLFIR: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> -! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<10xf32> +! HLFIR: %[[ALLOCA:.*]] = fir.alloca !fir.array<10xf32> ! 
HLFIR: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.private.init"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) ! HLFIR: acc.yield %[[DECLARE]]#0 : !fir.ref> -! CHECK: } copy { -! CHECK: ^bb0(%arg0: !fir.ref>, %arg1: !fir.ref>): -! CHECK: acc.terminator ! CHECK: } -! CHECK-LABEL: acc.private.recipe @privatization_ref_10xf32 : !fir.ref> init { +! CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_section_ext10_ref_10xf32 : !fir.ref> init { ! CHECK: ^bb0(%{{.*}}: !fir.ref>): ! HLFIR: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<10xf32> ! HLFIR: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]](%[[SHAPE]]) {uniq_name = "acc.private.init"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) ! HLFIR: acc.yield %[[DECLARE]]#0 : !fir.ref> +! CHECK: } copy { +! CHECK: ^bb0(%arg0: !fir.ref>, %arg1: !fir.ref>): +! CHECK: acc.terminator ! CHECK: } ! CHECK-LABEL: func.func @_QPacc_serial_loop() @@ -417,11 +418,9 @@ subroutine acc_serial_loop a(i) = b(i) END DO -! FIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[A]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} -! HLFIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! FIR: %[[ACC_FPRIVATE_B:.*]] = acc.firstprivate varPtr(%[[B]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! HLFIR: %[[ACC_FPRIVATE_B:.*]] = acc.firstprivate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.serial firstprivate(@firstprivatization_section_ext10_ref_10xf32 -> %[[ACC_FPRIVATE_B]] : !fir.ref>) private(@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) { +! CHECK: acc.serial firstprivate(@firstprivatization_section_ext10_ref_10xf32 -> %[[ACC_FPRIVATE_B]] : !fir.ref>) { ! FIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[A]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! HLFIR: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: acc.loop private(@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) { @@ -706,7 +705,7 @@ subroutine acc_serial_loop reduction_i = 1 end do -! CHECK: acc.serial reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { +! CHECK: acc.serial { ! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { ! CHECK: fir.do_loop ! 
CHECK: acc.yield diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 00e583903f1b455..df091613dc0a144 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -613,7 +613,6 @@ void LinkerScript::processSectionCommands() { discard(*s); discardSynthetic(*osec); osec->commands.clear(); - seenDiscard = true; return false; } diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index c97fdfab1d2f21c..18eaf58b785e370 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -356,7 +356,6 @@ class LinkerScript final { bool hasSectionsCommand = false; bool seenDataAlign = false; - bool seenDiscard = false; bool seenRelroEnd = false; bool errorOnMissingSection = false; std::string backwardDotErr; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 6f00c7ff8c0d1a8..57e1aa06c6aa873 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -53,7 +53,6 @@ template class Writer { void run(); private: - void copyLocalSymbols(); void addSectionSymbols(); void sortSections(); void resolveShfLinkOrder(); @@ -292,18 +291,6 @@ static void demoteSymbolsAndComputeIsPreemptible() { } } -static void demoteLocalSymbolsInDiscardedSections() { - llvm::TimeTraceScope timeScope("Demote local symbols"); - parallelForEach(ctx.objectFiles, [&](ELFFileBase *file) { - DenseMap sectionIndexMap; - for (Symbol *sym : file->getLocalSymbols()) { - Defined *d = dyn_cast(sym); - if (d && d->section && !d->section->isLive()) - demoteDefined(*d, sectionIndexMap); - } - }); -} - // Fully static executables don't support MTE globals at this point in time, as // we currently rely on: // - A dynamic loader to process relocations, and @@ -598,11 +585,6 @@ template void elf::createSyntheticSections() { // The main function of the writer. template void Writer::run() { - copyLocalSymbols(); - - if (config->copyRelocs) - addSectionSymbols(); - // Now that we have a complete set of output sections. This function // completes section contents. For example, we need to add strings // to the string table, and add entries to .got and .plt. @@ -751,31 +733,33 @@ bool lld::elf::includeInSymtab(const Symbol &b) { SectionBase *sec = d->section; if (!sec) return true; + assert(sec->isLive()); if (auto *s = dyn_cast(sec)) return s->getSectionPiece(d->value).live; - return sec->isLive(); + return true; } return b.used || !config->gcSections; } -// Local symbols are not in the linker's symbol table. This function scans -// each object file's symbol table to copy local symbols to the output. -template void Writer::copyLocalSymbols() { - if (!in.symTab) - return; +// Scan local symbols to: +// +// - demote symbols defined relative to /DISCARD/ discarded input sections so +// that relocations referencing them will lead to errors. +// - copy eligible symbols to .symTab +static void demoteAndCopyLocalSymbols() { llvm::TimeTraceScope timeScope("Add local symbols"); - if (config->copyRelocs && config->discard != DiscardPolicy::None) - markUsedLocalSymbols(); for (ELFFileBase *file : ctx.objectFiles) { + DenseMap sectionIndexMap; for (Symbol *b : file->getLocalSymbols()) { assert(b->isLocal() && "should have been caught in initializeSymbols()"); auto *dr = dyn_cast(b); - - // No reason to keep local undefined symbol in symtab. 
if (!dr) continue; - if (includeInSymtab(*b) && shouldKeepInSymtab(*dr)) + + if (dr->section && !dr->section->isLive()) + demoteDefined(*dr, sectionIndexMap); + else if (in.symTab && includeInSymtab(*b) && shouldKeepInSymtab(*dr)) in.symTab->addSymbol(b); } } @@ -1991,12 +1975,13 @@ template void Writer::finalizeSections() { } demoteSymbolsAndComputeIsPreemptible(); - // Also demote local symbols defined relative to discarded input sections so - // that relocations referencing them will lead to errors. To avoid unneeded - // work, we only do this when /DISCARD/ is seen, but this demotation also - // applies to --gc-sections discarded sections. - if (script->seenDiscard) - demoteLocalSymbolsInDiscardedSections(); + + if (config->copyRelocs && config->discard != DiscardPolicy::None) + markUsedLocalSymbols(); + demoteAndCopyLocalSymbols(); + + if (config->copyRelocs) + addSectionSymbols(); // Change values of linker-script-defined symbols from placeholders (assigned // by declareSymbols) to actual definitions. diff --git a/lld/test/ELF/gc-sections-tls.s b/lld/test/ELF/gc-sections-tls.s index edcf30e264909e0..3036a676dde1235 100644 --- a/lld/test/ELF/gc-sections-tls.s +++ b/lld/test/ELF/gc-sections-tls.s @@ -1,31 +1,25 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o -## Relocation in a non .debug_* referencing a discarded TLS symbol is invalid. -## If we happen to have no PT_TLS, we will emit an error. -# RUN: not ld.lld %t.o --gc-sections -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR - -# ERR: error: {{.*}}.o has an STT_TLS symbol but doesn't have an SHF_TLS section - -## TODO As a corner case, when /DISCARD/ is present, demoteLocalSymbolsInDiscardedSections -## demotes tls and the error is not triggered. -# RUN: echo 'SECTIONS { /DISCARD/ : {} }' > %t.lds -# RUN: ld.lld %t.o --gc-sections -T %t.lds -o /dev/null +## When a TLS section is discarded, we will resolve the relocation in a non-SHF_ALLOC +## section to the addend. Technically, we can emit an error in this case as the +## relocation type is not TLS. +# RUN: ld.lld %t.o --gc-sections -o %t +# RUN: llvm-readelf -x .noalloc %t | FileCheck %s -## If we happen to have a PT_TLS, we will resolve the relocation to -## an arbitrary value (current implementation uses a negative value). 
# RUN: echo '.section .tbss,"awT"; .globl root; root: .long 0' | \ # RUN: llvm-mc -filetype=obj -triple=x86_64 - -o %t1.o # RUN: ld.lld --gc-sections -u root %t.o %t1.o -o %t # RUN: llvm-readelf -x .noalloc %t | FileCheck %s # CHECK: Hex dump of section '.noalloc': -# CHECK-NEXT: 0x00000000 {{[0-9a-f]+}} ffffffff +# CHECK-NEXT: 0x00000000 00800000 00000000 .globl _start _start: .section .tbss,"awT",@nobits + .long 0 tls: .long 0 diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index d7b4f2587a98bf9..9073e3e9b2ee33d 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -12,7 +12,7 @@ set(LLDB_LIBEDIT_LIBS) if (LLDB_ENABLE_CURSES) list(APPEND LLDB_CURSES_LIBS ${PANEL_LIBRARIES} ${CURSES_LIBRARIES}) if(LLVM_ENABLE_TERMINFO) - list(APPEND LLDB_CURSES_LIBS ${TERMINFO_LIB}) + list(APPEND LLDB_CURSES_LIBS ${Terminfo_LIBRARIES}) endif() if (LLVM_BUILD_STATIC) list(APPEND LLDB_CURSES_LIBS gpm) diff --git a/llvm/cmake/modules/FindFFI.cmake b/llvm/cmake/modules/FindFFI.cmake index a493a89d630171f..c9ba104601872eb 100644 --- a/llvm/cmake/modules/FindFFI.cmake +++ b/llvm/cmake/modules/FindFFI.cmake @@ -38,15 +38,27 @@ find_library(FFI_LIBRARIES ffi PATHS ${FFI_LIBRARY_DIR}) if(FFI_LIBRARIES) include(CMakePushCheckState) - include(CheckCSourceCompiles) cmake_push_check_state() list(APPEND CMAKE_REQUIRED_LIBRARIES ${FFI_LIBRARIES}) - check_c_source_compiles(" + set(HAVE_FFI_CALL_SRC [=[ + #ifdef __cplusplus + extern "C" { + #endif struct ffi_cif; typedef struct ffi_cif ffi_cif; void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue); - int main(void) { ffi_call(0, 0, 0, 0); }" - HAVE_FFI_CALL) + #ifdef __cplusplus + } + #endif + int main(void) { ffi_call(0, 0, 0, 0); } + ]=]) + if(DEFINED CMAKE_C_COMPILER) + include(CheckCSourceCompiles) + check_c_source_compiles("${HAVE_FFI_CALL_SRC}" HAVE_FFI_CALL) + else() + include(CheckCXXSourceCompiles) + check_cxx_source_compiles("${HAVE_FFI_CALL_SRC}" HAVE_FFI_CALL) + endif() cmake_pop_check_state() endif() diff --git a/llvm/cmake/modules/FindTerminfo.cmake b/llvm/cmake/modules/FindTerminfo.cmake index eef1f95853eb27c..163af6697067710 100644 --- a/llvm/cmake/modules/FindTerminfo.cmake +++ b/llvm/cmake/modules/FindTerminfo.cmake @@ -15,13 +15,25 @@ find_library(Terminfo_LIBRARIES NAMES terminfo tinfo curses ncurses ncursesw) if(Terminfo_LIBRARIES) include(CMakePushCheckState) - include(CheckCSourceCompiles) cmake_push_check_state() list(APPEND CMAKE_REQUIRED_LIBRARIES ${Terminfo_LIBRARIES}) - check_c_source_compiles(" + set(Terminfo_LINKABLE_SRC [=[ + #ifdef __cplusplus + extern "C" { + #endif int setupterm(char *term, int filedes, int *errret); - int main(void) { return setupterm(0, 0, 0); }" - Terminfo_LINKABLE) + #ifdef __cplusplus + } + #endif + int main(void) { return setupterm(0, 0, 0); } + ]=]) + if(DEFINED CMAKE_C_COMPILER) + include(CheckCSourceCompiles) + check_c_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE) + else() + include(CheckCXXSourceCompiles) + check_cxx_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE) + endif() cmake_pop_check_state() endif() diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst index 70be73abef19d6d..5742f8594e99908 100644 --- a/llvm/docs/BitCodeFormat.rst +++ b/llvm/docs/BitCodeFormat.rst @@ -1085,6 +1085,15 @@ The integer codes are mapped to well-known attributes as follows. 
* code 77: ``elementtype`` * code 78: ``disable_sanitizer_instrumentation`` * code 79: ``nosanitize_bounds`` +* code 80: ``allocalign`` +* code 81: ``allocptr`` +* code 82: ``allockind`` +* code 83: ``presplitcoroutine`` +* code 84: ``fn_ret_thunk_extern`` +* code 85: ``skipprofile`` +* code 86: ``memory`` +* code 87: ``nofpclass`` +* code 88: ``optdebug`` .. note:: The ``allocsize`` attribute has a special encoding for its arguments. Its two diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 2035091be5a6840..3c178aa789970f9 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1872,6 +1872,8 @@ example: passes make choices that keep the code size of this function as small as possible and perform optimizations that may sacrifice runtime performance in order to minimize the size of the generated code. + This attribute is incompatible with the ``optdebug`` and ``optnone`` + attributes. ``naked`` This attribute disables prologue / epilogue emission for the function. This can have very system-specific consequences. @@ -2024,6 +2026,12 @@ example: Note: Comparing address of a global variable to ``null`` may still evaluate to false because of a limitation in querying this attribute inside constant expressions. +``optdebug`` + This attribute suggests that optimization passes and code generator passes + should make choices that try to preserve debug info without significantly + degrading runtime performance. + This attribute is incompatible with the ``minsize``, ``optsize``, and + ``optnone`` attributes. ``optforfuzzing`` This attribute indicates that this function should be optimized for maximum fuzzing signal. @@ -2033,7 +2041,7 @@ example: Code generation defaults to the "fast" instruction selector. This attribute cannot be used together with the ``alwaysinline`` attribute; this attribute is also incompatible - with the ``minsize`` attribute and the ``optsize`` attribute. + with the ``minsize``, ``optsize``, and ``optdebug`` attributes. This attribute requires the ``noinline`` attribute to be specified on the function as well, so the function is never inlined into any caller. @@ -2044,6 +2052,8 @@ example: passes make choices that keep the code size of this function low, and otherwise do optimizations specifically to reduce code size as long as they do not significantly impact runtime performance. + This attribute is incompatible with the ``optdebug`` and ``optnone`` + attributes. ``"patchable-function"`` This attribute tells the code generator that the code generated for this function needs to follow certain conventions that diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst index 79bb1f1c8f84266..96bccb6440d4aa7 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst @@ -94,14 +94,6 @@ use, in the form of "passes". LLVM Optimization Passes ======================== -.. warning:: - - Due to the transition to the new PassManager infrastructure this tutorial - is based on ``llvm::legacy::FunctionPassManager`` which can be found in - `LegacyPassManager.h `_. - For the purpose of the this tutorial the above should be used until - the pass manager transition is complete. - LLVM provides many optimization passes, which do many different sorts of things and have different tradeoffs. Unlike other systems, LLVM doesn't hold to the mistaken notion that one set of optimizations is right for @@ -127,44 +119,93 @@ in. 
If we wanted to make a "static Kaleidoscope compiler", we would use exactly the code we have now, except that we would defer running the optimizer until the entire file has been parsed. +In addition to the distinction between function and module passes, passes can be +divided into transform and analysis passes. Transform passes mutate the IR, and +analysis passes compute information that other passes can use. In order to add +a transform pass, all analysis passes it depends upon must be registered in +advance. + In order to get per-function optimizations going, we need to set up a `FunctionPassManager <../../WritingAnLLVMPass.html#what-passmanager-doesr>`_ to hold and organize the LLVM optimizations that we want to run. Once we have that, we can add a set of optimizations to run. We'll need a new FunctionPassManager for each module that we want to optimize, so we'll -write a function to create and initialize both the module and pass manager -for us: +add to a function created in the previous chapter (``InitializeModule()``): .. code-block:: c++ - void InitializeModuleAndPassManager(void) { + void InitializeModuleAndManagers(void) { // Open a new context and module. - TheModule = std::make_unique("my cool jit", *TheContext); + TheContext = std::make_unique(); + TheModule = std::make_unique("KaleidoscopeJIT", *TheContext); + TheModule->setDataLayout(TheJIT->getDataLayout()); - // Create a new pass manager attached to it. - TheFPM = std::make_unique(TheModule.get()); + // Create a new builder for the module. + Builder = std::make_unique>(*TheContext); + + // Create new pass and analysis managers. + TheFPM = std::make_unique(); + TheFAM = std::make_unique(); + TheMAM = std::make_unique(); + ThePIC = std::make_unique(); + TheSI = std::make_unique(*TheContext, + /*DebugLogging*/ true); + TheSI->registerCallbacks(*ThePIC, TheMAM.get()); + ... +After initializing the global module ``TheModule`` and the FunctionPassManager, +we need to initialize other parts of the framework. The FunctionAnalysisManager +and ModuleAnalysisManager allow us to add analysis passes that run across the +function and the whole module, respectively. PassInstrumentationCallbacks +and StandardInstrumentations are required for the pass instrumentation +framework, which allows developers to customize what +happens between passes. + +Once these managers are set up, we use a series of "addPass" calls to add a +bunch of LLVM transform passes: + +.. code-block:: c++ + + // Add transform passes. // Do simple "peephole" optimizations and bit-twiddling optzns. - TheFPM->add(createInstructionCombiningPass()); + TheFPM->addPass(InstCombinePass()); // Reassociate expressions. - TheFPM->add(createReassociatePass()); + TheFPM->addPass(ReassociatePass()); // Eliminate Common SubExpressions. - TheFPM->add(createGVNPass()); + TheFPM->addPass(GVNPass()); // Simplify the control flow graph (deleting unreachable blocks, etc). - TheFPM->add(createCFGSimplificationPass()); - - TheFPM->doInitialization(); - } - -This code initializes the global module ``TheModule``, and the function pass -manager ``TheFPM``, which is attached to ``TheModule``. Once the pass manager is -set up, we use a series of "add" calls to add a bunch of LLVM passes. + TheFPM->addPass(SimplifyCFGPass()); In this case, we choose to add four optimization passes. The passes we choose here are a pretty standard set of "cleanup" optimizations that are useful for a wide variety of code. I won't delve into what they do but, believe me, they are a good starting place :). 
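+As an aside, ``PassBuilder`` can register the whole standard set of analyses
+with a handful of calls. The sketch below is only an illustration of that
+shortcut; the ``PB``, ``LAM``, and ``CGAM`` variables are local to the sketch
+and are not part of this chapter's code:
+
+.. code-block:: c++
+
+    // Sketch of the PassBuilder shortcut; assumes the TheFAM/TheMAM managers
+    // created above plus two managers this chapter does not otherwise use.
+    PassBuilder PB;
+    LoopAnalysisManager LAM;
+    CGSCCAnalysisManager CGAM;
+    PB.registerModuleAnalyses(*TheMAM);
+    PB.registerCGSCCAnalyses(CGAM);
+    PB.registerFunctionAnalyses(*TheFAM);
+    PB.registerLoopAnalyses(LAM);
+    PB.crossRegisterProxies(LAM, *TheFAM, CGAM, *TheMAM);
+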
+Next, we register the analysis passes used by the transform passes. This is +generally done using ``PassBuilder::register...Analyses()``, but we'll do it +manually to make clearer what's under the hood. + +.. code-block:: c++ + + // Register analysis passes used in these transform passes. + TheFAM->registerPass([&] { return AAManager(); }); + TheFAM->registerPass([&] { return AssumptionAnalysis(); }); + TheFAM->registerPass([&] { return DominatorTreeAnalysis(); }); + TheFAM->registerPass([&] { return LoopAnalysis(); }); + TheFAM->registerPass([&] { return MemoryDependenceAnalysis(); }); + TheFAM->registerPass([&] { return MemorySSAAnalysis(); }); + TheFAM->registerPass([&] { return OptimizationRemarkEmitterAnalysis(); }); + TheFAM->registerPass([&] { + return OuterAnalysisManagerProxy(*TheMAM); + }); + TheFAM->registerPass( + [&] { return PassInstrumentationAnalysis(ThePIC.get()); }); + TheFAM->registerPass([&] { return TargetIRAnalysis(); }); + TheFAM->registerPass([&] { return TargetLibraryAnalysis(); }); + + TheMAM->registerPass([&] { return ProfileSummaryAnalysis(); }); + } + Once the PassManager is set up, we need to make use of it. We do this by running it after our newly created function is constructed (in ``FunctionAST::codegen()``), but before it is returned to the client: @@ -179,7 +220,7 @@ running it after our newly created function is constructed (in verifyFunction(*TheFunction); // Optimize the function. - TheFPM->run(*TheFunction); + TheFPM->run(*TheFunction, *TheFAM); return TheFunction; } diff --git a/llvm/examples/Kaleidoscope/Chapter4/toy.cpp b/llvm/examples/Kaleidoscope/Chapter4/toy.cpp index fb443c7f1514b28..19ec70efd5e1553 100644 --- a/llvm/examples/Kaleidoscope/Chapter4/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter4/toy.cpp @@ -1,21 +1,32 @@ #include "../include/KaleidoscopeJIT.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Verifier.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include #include #include @@ -413,8 +424,12 @@ static std::unique_ptr TheContext; static std::unique_ptr TheModule; static std::unique_ptr> Builder; static std::map NamedValues; -static std::unique_ptr TheFPM; static std::unique_ptr TheJIT; +static std::unique_ptr TheFPM; +static std::unique_ptr TheFAM; +static std::unique_ptr TheMAM; +static std::unique_ptr ThePIC; +static std::unique_ptr TheSI; static std::map> FunctionProtos; static ExitOnError ExitOnErr; @@ -535,7 +550,7 @@ Function *FunctionAST::codegen() { verifyFunction(*TheFunction); // Run the optimizer on the 
function. - TheFPM->run(*TheFunction); + TheFPM->run(*TheFunction, *TheFAM); return TheFunction; } @@ -549,28 +564,51 @@ Function *FunctionAST::codegen() { // Top-Level parsing and JIT Driver //===----------------------------------------------------------------------===// -static void InitializeModuleAndPassManager() { +static void InitializeModuleAndManagers() { // Open a new context and module. TheContext = std::make_unique(); - TheModule = std::make_unique("my cool jit", *TheContext); + TheModule = std::make_unique("KaleidoscopeJIT", *TheContext); TheModule->setDataLayout(TheJIT->getDataLayout()); // Create a new builder for the module. Builder = std::make_unique>(*TheContext); - // Create a new pass manager attached to it. - TheFPM = std::make_unique(TheModule.get()); + // Create new pass and analysis managers. + TheFPM = std::make_unique(); + TheFAM = std::make_unique(); + TheMAM = std::make_unique(); + ThePIC = std::make_unique(); + TheSI = std::make_unique(*TheContext, + /*DebugLogging*/ true); + TheSI->registerCallbacks(*ThePIC, TheMAM.get()); + // Add transform passes. // Do simple "peephole" optimizations and bit-twiddling optzns. - TheFPM->add(createInstructionCombiningPass()); + TheFPM->addPass(InstCombinePass()); // Reassociate expressions. - TheFPM->add(createReassociatePass()); + TheFPM->addPass(ReassociatePass()); // Eliminate Common SubExpressions. - TheFPM->add(createGVNPass()); + TheFPM->addPass(GVNPass()); // Simplify the control flow graph (deleting unreachable blocks, etc). - TheFPM->add(createCFGSimplificationPass()); - - TheFPM->doInitialization(); + TheFPM->addPass(SimplifyCFGPass()); + + // Register analysis passes used in these transform passes. + TheFAM->registerPass([&] { return AAManager(); }); + TheFAM->registerPass([&] { return AssumptionAnalysis(); }); + TheFAM->registerPass([&] { return DominatorTreeAnalysis(); }); + TheFAM->registerPass([&] { return LoopAnalysis(); }); + TheFAM->registerPass([&] { return MemoryDependenceAnalysis(); }); + TheFAM->registerPass([&] { return MemorySSAAnalysis(); }); + TheFAM->registerPass([&] { return OptimizationRemarkEmitterAnalysis(); }); + TheFAM->registerPass([&] { + return OuterAnalysisManagerProxy(*TheMAM); + }); + TheFAM->registerPass( + [&] { return PassInstrumentationAnalysis(ThePIC.get()); }); + TheFAM->registerPass([&] { return TargetIRAnalysis(); }); + TheFAM->registerPass([&] { return TargetLibraryAnalysis(); }); + + TheMAM->registerPass([&] { return ProfileSummaryAnalysis(); }); } static void HandleDefinition() { @@ -581,7 +619,7 @@ static void HandleDefinition() { fprintf(stderr, "\n"); ExitOnErr(TheJIT->addModule( ThreadSafeModule(std::move(TheModule), std::move(TheContext)))); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); } } else { // Skip token for error recovery. @@ -613,7 +651,7 @@ static void HandleTopLevelExpression() { auto TSM = ThreadSafeModule(std::move(TheModule), std::move(TheContext)); ExitOnErr(TheJIT->addModule(std::move(TSM), RT)); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Search the JIT for the __anon_expr symbol. auto ExprSymbol = ExitOnErr(TheJIT->lookup("__anon_expr")); @@ -699,7 +737,7 @@ int main() { TheJIT = ExitOnErr(KaleidoscopeJIT::Create()); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Run the main "interpreter loop" now. 
MainLoop(); diff --git a/llvm/examples/Kaleidoscope/Chapter5/toy.cpp b/llvm/examples/Kaleidoscope/Chapter5/toy.cpp index dc7174aa1c4b3a1..f41f08de51de095 100644 --- a/llvm/examples/Kaleidoscope/Chapter5/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter5/toy.cpp @@ -1,6 +1,13 @@ #include "../include/KaleidoscopeJIT.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -8,15 +15,19 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Verifier.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include #include #include @@ -540,8 +551,12 @@ static std::unique_ptr TheContext; static std::unique_ptr TheModule; static std::unique_ptr> Builder; static std::map NamedValues; -static std::unique_ptr TheFPM; static std::unique_ptr TheJIT; +static std::unique_ptr TheFPM; +static std::unique_ptr TheFAM; +static std::unique_ptr TheMAM; +static std::unique_ptr ThePIC; +static std::unique_ptr TheSI; static std::map> FunctionProtos; static ExitOnError ExitOnErr; @@ -809,7 +824,7 @@ Function *FunctionAST::codegen() { verifyFunction(*TheFunction); // Run the optimizer on the function. - TheFPM->run(*TheFunction); + TheFPM->run(*TheFunction, *TheFAM); return TheFunction; } @@ -823,28 +838,51 @@ Function *FunctionAST::codegen() { // Top-Level parsing and JIT Driver //===----------------------------------------------------------------------===// -static void InitializeModuleAndPassManager() { - // Open a new module. +static void InitializeModuleAndManagers() { + // Open a new context and module. TheContext = std::make_unique(); - TheModule = std::make_unique("my cool jit", *TheContext); + TheModule = std::make_unique("KaleidoscopeJIT", *TheContext); TheModule->setDataLayout(TheJIT->getDataLayout()); // Create a new builder for the module. Builder = std::make_unique>(*TheContext); - // Create a new pass manager attached to it. - TheFPM = std::make_unique(TheModule.get()); + // Create new pass and analysis managers. + TheFPM = std::make_unique(); + TheFAM = std::make_unique(); + TheMAM = std::make_unique(); + ThePIC = std::make_unique(); + TheSI = std::make_unique(*TheContext, + /*DebugLogging*/ true); + TheSI->registerCallbacks(*ThePIC, TheMAM.get()); + // Add transform passes. // Do simple "peephole" optimizations and bit-twiddling optzns. - TheFPM->add(createInstructionCombiningPass()); + TheFPM->addPass(InstCombinePass()); // Reassociate expressions. - TheFPM->add(createReassociatePass()); + TheFPM->addPass(ReassociatePass()); // Eliminate Common SubExpressions. 
- TheFPM->add(createGVNPass()); + TheFPM->addPass(GVNPass()); // Simplify the control flow graph (deleting unreachable blocks, etc). - TheFPM->add(createCFGSimplificationPass()); - - TheFPM->doInitialization(); + TheFPM->addPass(SimplifyCFGPass()); + + // Register analysis passes used in these transform passes. + TheFAM->registerPass([&] { return AAManager(); }); + TheFAM->registerPass([&] { return AssumptionAnalysis(); }); + TheFAM->registerPass([&] { return DominatorTreeAnalysis(); }); + TheFAM->registerPass([&] { return LoopAnalysis(); }); + TheFAM->registerPass([&] { return MemoryDependenceAnalysis(); }); + TheFAM->registerPass([&] { return MemorySSAAnalysis(); }); + TheFAM->registerPass([&] { return OptimizationRemarkEmitterAnalysis(); }); + TheFAM->registerPass([&] { + return OuterAnalysisManagerProxy(*TheMAM); + }); + TheFAM->registerPass( + [&] { return PassInstrumentationAnalysis(ThePIC.get()); }); + TheFAM->registerPass([&] { return TargetIRAnalysis(); }); + TheFAM->registerPass([&] { return TargetLibraryAnalysis(); }); + + TheMAM->registerPass([&] { return ProfileSummaryAnalysis(); }); } static void HandleDefinition() { @@ -855,7 +893,7 @@ static void HandleDefinition() { fprintf(stderr, "\n"); ExitOnErr(TheJIT->addModule( ThreadSafeModule(std::move(TheModule), std::move(TheContext)))); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); } } else { // Skip token for error recovery. @@ -887,7 +925,7 @@ static void HandleTopLevelExpression() { auto TSM = ThreadSafeModule(std::move(TheModule), std::move(TheContext)); ExitOnErr(TheJIT->addModule(std::move(TSM), RT)); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Search the JIT for the __anon_expr symbol. auto ExprSymbol = ExitOnErr(TheJIT->lookup("__anon_expr")); @@ -973,7 +1011,7 @@ int main() { TheJIT = ExitOnErr(KaleidoscopeJIT::Create()); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Run the main "interpreter loop" now. 
MainLoop(); diff --git a/llvm/examples/Kaleidoscope/Chapter6/toy.cpp b/llvm/examples/Kaleidoscope/Chapter6/toy.cpp index f40eea3c3a53d5c..ad275edc68a21d8 100644 --- a/llvm/examples/Kaleidoscope/Chapter6/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter6/toy.cpp @@ -1,6 +1,13 @@ #include "../include/KaleidoscopeJIT.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -8,15 +15,19 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Verifier.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include #include #include @@ -632,8 +643,12 @@ static std::unique_ptr TheContext; static std::unique_ptr TheModule; static std::unique_ptr> Builder; static std::map NamedValues; -static std::unique_ptr TheFPM; static std::unique_ptr TheJIT; +static std::unique_ptr TheFPM; +static std::unique_ptr TheFAM; +static std::unique_ptr TheMAM; +static std::unique_ptr ThePIC; +static std::unique_ptr TheSI; static std::map> FunctionProtos; static ExitOnError ExitOnErr; @@ -925,7 +940,7 @@ Function *FunctionAST::codegen() { verifyFunction(*TheFunction); // Run the optimizer on the function. - TheFPM->run(*TheFunction); + TheFPM->run(*TheFunction, *TheFAM); return TheFunction; } @@ -942,28 +957,51 @@ Function *FunctionAST::codegen() { // Top-Level parsing and JIT Driver //===----------------------------------------------------------------------===// -static void InitializeModuleAndPassManager() { - // Open a new module. +static void InitializeModuleAndManagers() { + // Open a new context and module. TheContext = std::make_unique(); - TheModule = std::make_unique("my cool jit", *TheContext); + TheModule = std::make_unique("KaleidoscopeJIT", *TheContext); TheModule->setDataLayout(TheJIT->getDataLayout()); // Create a new builder for the module. Builder = std::make_unique>(*TheContext); - // Create a new pass manager attached to it. - TheFPM = std::make_unique(TheModule.get()); + // Create new pass and analysis managers. + TheFPM = std::make_unique(); + TheFAM = std::make_unique(); + TheMAM = std::make_unique(); + ThePIC = std::make_unique(); + TheSI = std::make_unique(*TheContext, + /*DebugLogging*/ true); + TheSI->registerCallbacks(*ThePIC, TheMAM.get()); + // Add transform passes. // Do simple "peephole" optimizations and bit-twiddling optzns. - TheFPM->add(createInstructionCombiningPass()); + TheFPM->addPass(InstCombinePass()); // Reassociate expressions. - TheFPM->add(createReassociatePass()); + TheFPM->addPass(ReassociatePass()); // Eliminate Common SubExpressions. 
- TheFPM->add(createGVNPass()); + TheFPM->addPass(GVNPass()); // Simplify the control flow graph (deleting unreachable blocks, etc). - TheFPM->add(createCFGSimplificationPass()); - - TheFPM->doInitialization(); + TheFPM->addPass(SimplifyCFGPass()); + + // Register analysis passes used in these transform passes. + TheFAM->registerPass([&] { return AAManager(); }); + TheFAM->registerPass([&] { return AssumptionAnalysis(); }); + TheFAM->registerPass([&] { return DominatorTreeAnalysis(); }); + TheFAM->registerPass([&] { return LoopAnalysis(); }); + TheFAM->registerPass([&] { return MemoryDependenceAnalysis(); }); + TheFAM->registerPass([&] { return MemorySSAAnalysis(); }); + TheFAM->registerPass([&] { return OptimizationRemarkEmitterAnalysis(); }); + TheFAM->registerPass([&] { + return OuterAnalysisManagerProxy(*TheMAM); + }); + TheFAM->registerPass( + [&] { return PassInstrumentationAnalysis(ThePIC.get()); }); + TheFAM->registerPass([&] { return TargetIRAnalysis(); }); + TheFAM->registerPass([&] { return TargetLibraryAnalysis(); }); + + TheMAM->registerPass([&] { return ProfileSummaryAnalysis(); }); } static void HandleDefinition() { @@ -974,7 +1012,7 @@ static void HandleDefinition() { fprintf(stderr, "\n"); ExitOnErr(TheJIT->addModule( ThreadSafeModule(std::move(TheModule), std::move(TheContext)))); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); } } else { // Skip token for error recovery. @@ -1006,7 +1044,7 @@ static void HandleTopLevelExpression() { auto TSM = ThreadSafeModule(std::move(TheModule), std::move(TheContext)); ExitOnErr(TheJIT->addModule(std::move(TSM), RT)); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Search the JIT for the __anon_expr symbol. auto ExprSymbol = ExitOnErr(TheJIT->lookup("__anon_expr")); @@ -1092,7 +1130,7 @@ int main() { TheJIT = ExitOnErr(KaleidoscopeJIT::Create()); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Run the main "interpreter loop" now. 
MainLoop(); diff --git a/llvm/examples/Kaleidoscope/Chapter7/toy.cpp b/llvm/examples/Kaleidoscope/Chapter7/toy.cpp index 5bbab8d563fb597..f2954a4cf1f2627 100644 --- a/llvm/examples/Kaleidoscope/Chapter7/toy.cpp +++ b/llvm/examples/Kaleidoscope/Chapter7/toy.cpp @@ -1,6 +1,13 @@ #include "../include/KaleidoscopeJIT.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -8,15 +15,19 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Verifier.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Utils.h" #include #include @@ -705,8 +716,12 @@ static std::unique_ptr TheContext; static std::unique_ptr TheModule; static std::unique_ptr> Builder; static std::map NamedValues; -static std::unique_ptr TheFPM; static std::unique_ptr TheJIT; +static std::unique_ptr TheFPM; +static std::unique_ptr TheFAM; +static std::unique_ptr TheMAM; +static std::unique_ptr ThePIC; +static std::unique_ptr TheSI; static std::map> FunctionProtos; static ExitOnError ExitOnErr; @@ -1094,7 +1109,7 @@ Function *FunctionAST::codegen() { verifyFunction(*TheFunction); // Run the optimizer on the function. - TheFPM->run(*TheFunction); + TheFPM->run(*TheFunction, *TheFAM); return TheFunction; } @@ -1111,30 +1126,51 @@ Function *FunctionAST::codegen() { // Top-Level parsing and JIT Driver //===----------------------------------------------------------------------===// -static void InitializeModuleAndPassManager() { - // Open a new module. +static void InitializeModuleAndManagers() { + // Open a new context and module. TheContext = std::make_unique(); - TheModule = std::make_unique("my cool jit", *TheContext); + TheModule = std::make_unique("KaleidoscopeJIT", *TheContext); TheModule->setDataLayout(TheJIT->getDataLayout()); // Create a new builder for the module. Builder = std::make_unique>(*TheContext); - // Create a new pass manager attached to it. - TheFPM = std::make_unique(TheModule.get()); + // Create new pass and analysis managers. + TheFPM = std::make_unique(); + TheFAM = std::make_unique(); + TheMAM = std::make_unique(); + ThePIC = std::make_unique(); + TheSI = std::make_unique(*TheContext, + /*DebugLogging*/ true); + TheSI->registerCallbacks(*ThePIC, TheMAM.get()); - // Promote allocas to registers. - TheFPM->add(createPromoteMemoryToRegisterPass()); + // Add transform passes. // Do simple "peephole" optimizations and bit-twiddling optzns. - TheFPM->add(createInstructionCombiningPass()); + TheFPM->addPass(InstCombinePass()); // Reassociate expressions. 
- TheFPM->add(createReassociatePass()); + TheFPM->addPass(ReassociatePass()); // Eliminate Common SubExpressions. - TheFPM->add(createGVNPass()); + TheFPM->addPass(GVNPass()); // Simplify the control flow graph (deleting unreachable blocks, etc). - TheFPM->add(createCFGSimplificationPass()); - - TheFPM->doInitialization(); + TheFPM->addPass(SimplifyCFGPass()); + + // Register analysis passes used in these transform passes. + TheFAM->registerPass([&] { return AAManager(); }); + TheFAM->registerPass([&] { return AssumptionAnalysis(); }); + TheFAM->registerPass([&] { return DominatorTreeAnalysis(); }); + TheFAM->registerPass([&] { return LoopAnalysis(); }); + TheFAM->registerPass([&] { return MemoryDependenceAnalysis(); }); + TheFAM->registerPass([&] { return MemorySSAAnalysis(); }); + TheFAM->registerPass([&] { return OptimizationRemarkEmitterAnalysis(); }); + TheFAM->registerPass([&] { + return OuterAnalysisManagerProxy(*TheMAM); + }); + TheFAM->registerPass( + [&] { return PassInstrumentationAnalysis(ThePIC.get()); }); + TheFAM->registerPass([&] { return TargetIRAnalysis(); }); + TheFAM->registerPass([&] { return TargetLibraryAnalysis(); }); + + TheMAM->registerPass([&] { return ProfileSummaryAnalysis(); }); } static void HandleDefinition() { @@ -1145,7 +1181,7 @@ static void HandleDefinition() { fprintf(stderr, "\n"); ExitOnErr(TheJIT->addModule( ThreadSafeModule(std::move(TheModule), std::move(TheContext)))); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); } } else { // Skip token for error recovery. @@ -1177,7 +1213,7 @@ static void HandleTopLevelExpression() { auto TSM = ThreadSafeModule(std::move(TheModule), std::move(TheContext)); ExitOnErr(TheJIT->addModule(std::move(TSM), RT)); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Search the JIT for the __anon_expr symbol. auto ExprSymbol = ExitOnErr(TheJIT->lookup("__anon_expr")); @@ -1264,7 +1300,7 @@ int main() { TheJIT = ExitOnErr(KaleidoscopeJIT::Create()); - InitializeModuleAndPassManager(); + InitializeModuleAndManagers(); // Run the main "interpreter loop" now. MainLoop(); diff --git a/llvm/include/llvm/Analysis/WithCache.h b/llvm/include/llvm/Analysis/WithCache.h index 8065c45738f840b..7bd680bf097af1f 100644 --- a/llvm/include/llvm/Analysis/WithCache.h +++ b/llvm/include/llvm/Analysis/WithCache.h @@ -14,6 +14,7 @@ #ifndef LLVM_ANALYSIS_WITHCACHE_H #define LLVM_ANALYSIS_WITHCACHE_H +#include "llvm/ADT/PointerIntPair.h" #include "llvm/IR/Value.h" #include "llvm/Support/KnownBits.h" #include diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 52e76356a892e45..5d7be5ca936ad37 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -713,6 +713,7 @@ enum AttributeKindCodes { ATTR_KIND_SKIP_PROFILE = 85, ATTR_KIND_MEMORY = 86, ATTR_KIND_NOFPCLASS = 87, + ATTR_KIND_OPTIMIZE_FOR_DEBUGGING = 88, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 6ee85552bfbc67c..03b668d8023b844 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -16,7 +16,7 @@ /* Indicate that this is LLVM compiled from the amd-gfx branch. 
*/ #define LLVM_HAVE_BRANCH_AMD_GFX -#define LLVM_MAIN_REVISION 477871 +#define LLVM_MAIN_REVISION 477905 /* Define if LLVM_ENABLE_DUMP is enabled */ #cmakedefine LLVM_ENABLE_DUMP diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index 3c0770787463e6c..7084081ce61a43a 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -79,8 +79,8 @@ class DWARFUnitHeader { /// Note that \p SectionKind is used as a hint to guess the unit type /// for DWARF formats prior to DWARFv5. In DWARFv5 the unit type is /// explicitly defined in the header and the hint is ignored. - bool extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, - uint64_t *offset_ptr, DWARFSectionKind SectionKind); + Error extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, + uint64_t *offset_ptr, DWARFSectionKind SectionKind); // For units in DWARF Package File, remember the index entry and update // the abbreviation offset read by extract(). bool applyIndexEntry(const DWARFUnitIndex::Entry *Entry); diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index aba1d718f7f72f9..fda79f5f24495fb 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -200,6 +200,9 @@ def NoSanitizeCoverage : EnumAttr<"nosanitize_coverage", [FnAttr]>; /// Null pointer in address space zero is valid. def NullPointerIsValid : EnumAttr<"null_pointer_is_valid", [FnAttr]>; +/// Select optimizations that give decent debug info. +def OptimizeForDebugging : EnumAttr<"optdebug", [FnAttr]>; + /// Select optimizations for best fuzzing signal. def OptForFuzzing : EnumAttr<"optforfuzzing", [FnAttr]>; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 1d1ec988a93d847..16eafa6e18f5d59 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1980,6 +1980,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::NoSanitizeCoverage; case bitc::ATTR_KIND_NULL_POINTER_IS_VALID: return Attribute::NullPointerIsValid; + case bitc::ATTR_KIND_OPTIMIZE_FOR_DEBUGGING: + return Attribute::OptimizeForDebugging; case bitc::ATTR_KIND_OPT_FOR_FUZZING: return Attribute::OptForFuzzing; case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index e991d055f33474b..c427459508ecfc8 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -747,6 +747,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_SANITIZE_COVERAGE; case Attribute::NullPointerIsValid: return bitc::ATTR_KIND_NULL_POINTER_IS_VALID; + case Attribute::OptimizeForDebugging: + return bitc::ATTR_KIND_OPTIMIZE_FOR_DEBUGGING; case Attribute::OptForFuzzing: return bitc::ATTR_KIND_OPT_FOR_FUZZING; case Attribute::OptimizeForSize: diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index 372897835cce199..724f816ad094a75 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -89,9 +89,12 @@ void fixupIndexV4(DWARFContext &C, DWARFUnitIndex &Index) { DWARFDataExtractor Data(DObj, S, C.isLittleEndian(), 0); while (Data.isValidOffset(Offset)) { DWARFUnitHeader Header; - if (!Header.extract(C, Data, &Offset, 
DWARFSectionKind::DW_SECT_INFO)) { + if (Error ExtractionErr = Header.extract( + C, Data, &Offset, DWARFSectionKind::DW_SECT_INFO)) { logAllUnhandledErrors( - createError("Failed to parse CU header in DWP file"), errs()); + createError("Failed to parse CU header in DWP file: " + + toString(std::move(ExtractionErr))), + errs()); Map.clear(); break; } @@ -149,9 +152,12 @@ void fixupIndexV5(DWARFContext &C, DWARFUnitIndex &Index) { uint64_t Offset = 0; while (Data.isValidOffset(Offset)) { DWARFUnitHeader Header; - if (!Header.extract(C, Data, &Offset, DWARFSectionKind::DW_SECT_INFO)) { + if (Error ExtractionErr = Header.extract( + C, Data, &Offset, DWARFSectionKind::DW_SECT_INFO)) { logAllUnhandledErrors( - createError("Failed to parse unit header in DWP file"), errs()); + createError("Failed to parse CU header in DWP file: " + + toString(std::move(ExtractionErr))), + errs()); break; } bool CU = Header.getUnitType() == DW_UT_split_compile; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index 0cd45bde3e25349..9f455fa7e96a7ef 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -81,8 +81,11 @@ void DWARFUnitVector::addUnitsImpl( if (!Data.isValidOffset(Offset)) return nullptr; DWARFUnitHeader Header; - if (!Header.extract(Context, Data, &Offset, SectionKind)) + if (Error ExtractErr = + Header.extract(Context, Data, &Offset, SectionKind)) { + Context.getWarningHandler()(std::move(ExtractErr)); return nullptr; + } if (!IndexEntry && IsDWO) { const DWARFUnitIndex &Index = getDWARFUnitIndex( Context, Header.isTypeUnit() ? DW_SECT_EXT_TYPES : DW_SECT_INFO); @@ -244,10 +247,10 @@ Expected DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { return DA.getRelocatedValue(ItemSize, &Offset); } -bool DWARFUnitHeader::extract(DWARFContext &Context, - const DWARFDataExtractor &debug_info, - uint64_t *offset_ptr, - DWARFSectionKind SectionKind) { +Error DWARFUnitHeader::extract(DWARFContext &Context, + const DWARFDataExtractor &debug_info, + uint64_t *offset_ptr, + DWARFSectionKind SectionKind) { Offset = *offset_ptr; Error Err = Error::success(); IndexEntry = nullptr; @@ -277,72 +280,58 @@ bool DWARFUnitHeader::extract(DWARFContext &Context, } else if (UnitType == DW_UT_split_compile || UnitType == DW_UT_skeleton) DWOId = debug_info.getU64(offset_ptr, &Err); - if (Err) { - Context.getWarningHandler()(joinErrors( + if (Err) + return joinErrors( createStringError( errc::invalid_argument, "DWARF unit at 0x%8.8" PRIx64 " cannot be parsed:", Offset), - std::move(Err))); - return false; - } + std::move(Err)); // Header fields all parsed, capture the size of this unit header. assert(*offset_ptr - Offset <= 255 && "unexpected header size"); Size = uint8_t(*offset_ptr - Offset); uint64_t NextCUOffset = Offset + getUnitLengthFieldByteSize() + getLength(); - if (!debug_info.isValidOffset(getNextUnitOffset() - 1)) { - Context.getWarningHandler()( - createStringError(errc::invalid_argument, - "DWARF unit from offset 0x%8.8" PRIx64 " incl. " - "to offset 0x%8.8" PRIx64 " excl. " - "extends past section size 0x%8.8zx", - Offset, NextCUOffset, debug_info.size())); - return false; - } + if (!debug_info.isValidOffset(getNextUnitOffset() - 1)) + return createStringError(errc::invalid_argument, + "DWARF unit from offset 0x%8.8" PRIx64 " incl. " + "to offset 0x%8.8" PRIx64 " excl. 
" + "extends past section size 0x%8.8zx", + Offset, NextCUOffset, debug_info.size()); - if (!DWARFContext::isSupportedVersion(getVersion())) { - Context.getWarningHandler()(createStringError( + if (!DWARFContext::isSupportedVersion(getVersion())) + return createStringError( errc::invalid_argument, "DWARF unit at offset 0x%8.8" PRIx64 " " "has unsupported version %" PRIu16 ", supported are 2-%u", - Offset, getVersion(), DWARFContext::getMaxSupportedVersion())); - return false; - } + Offset, getVersion(), DWARFContext::getMaxSupportedVersion()); // Type offset is unit-relative; should be after the header and before // the end of the current unit. - if (isTypeUnit() && TypeOffset < Size) { - Context.getWarningHandler()( - createStringError(errc::invalid_argument, - "DWARF type unit at offset " - "0x%8.8" PRIx64 " " - "has its relocated type_offset 0x%8.8" PRIx64 " " - "pointing inside the header", - Offset, Offset + TypeOffset)); - return false; - } - if (isTypeUnit() && - TypeOffset >= getUnitLengthFieldByteSize() + getLength()) { - Context.getWarningHandler()(createStringError( + if (isTypeUnit() && TypeOffset < Size) + return createStringError(errc::invalid_argument, + "DWARF type unit at offset " + "0x%8.8" PRIx64 " " + "has its relocated type_offset 0x%8.8" PRIx64 " " + "pointing inside the header", + Offset, Offset + TypeOffset); + + if (isTypeUnit() && TypeOffset >= getUnitLengthFieldByteSize() + getLength()) + return createStringError( errc::invalid_argument, "DWARF type unit from offset 0x%8.8" PRIx64 " incl. " "to offset 0x%8.8" PRIx64 " excl. has its " "relocated type_offset 0x%8.8" PRIx64 " pointing past the unit end", - Offset, NextCUOffset, Offset + TypeOffset)); - return false; - } + Offset, NextCUOffset, Offset + TypeOffset); if (Error SizeErr = DWARFContext::checkAddressSizeSupported( getAddressByteSize(), errc::invalid_argument, - "DWARF unit at offset 0x%8.8" PRIx64, Offset)) { - Context.getWarningHandler()(std::move(SizeErr)); - return false; - } + "DWARF unit at offset 0x%8.8" PRIx64, Offset)) + return SizeErr; // Keep track of the highest DWARF version we encounter across all units. 
Context.setMaxVersionIfGreater(getVersion()); - return true; + return Error::success(); } bool DWARFUnitHeader::applyIndexEntry(const DWARFUnitIndex::Entry *Entry) { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 5a3328416db3eb0..3c4efd7e359c524 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2118,6 +2118,17 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, Check(!Attrs.hasFnAttr(Attribute::MinSize), "Attributes 'minsize and optnone' are incompatible!", V); + + Check(!Attrs.hasFnAttr(Attribute::OptimizeForDebugging), + "Attributes 'optdebug and optnone' are incompatible!", V); + } + + if (Attrs.hasFnAttr(Attribute::OptimizeForDebugging)) { + Check(!Attrs.hasFnAttr(Attribute::OptimizeForSize), + "Attributes 'optsize and optdebug' are incompatible!", V); + + Check(!Attrs.hasFnAttr(Attribute::MinSize), + "Attributes 'minsize and optdebug' are incompatible!", V); } if (Attrs.hasFnAttr("aarch64_pstate_sm_enabled")) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 239d82979c91939..04fd66148cabb80 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5448,6 +5448,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return true; } + if (MO->isImm()) { + uint64_t Imm = MO->getImm(); + bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64; + bool Is64BitOp = Is64BitFPOp || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; + if (Is64BitOp && !AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) && + !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) + return false; + } + // Handle non-register types that are treated like immediates. assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index f88bd9b45aee601..3db777f904df0ad 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3620,9 +3620,17 @@ SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op, int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF); SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT); - // FIXME The frontend should detect this case. if (Depth > 0) { - report_fatal_error("Unsupported stack frame traversal count"); + // FIXME The frontend should detect this case. + if (!MF.getFunction().hasFnAttribute("backchain")) + report_fatal_error("Unsupported stack frame traversal count"); + + SDValue Offset = DAG.getConstant(TFL->getBackchainOffset(MF), DL, PtrVT); + while (Depth--) { + BackChain = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), BackChain, + MachinePointerInfo()); + BackChain = DAG.getNode(ISD::ADD, DL, PtrVT, BackChain, Offset); + } } return BackChain; @@ -3641,9 +3649,19 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op, unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); - // FIXME The frontend should detect this case. if (Depth > 0) { - report_fatal_error("Unsupported stack frame traversal count"); + // FIXME The frontend should detect this case. 
+ if (!MF.getFunction().hasFnAttribute("backchain"))
+ report_fatal_error("Unsupported stack frame traversal count");
+
+ SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
+ auto *TFL = Subtarget.getFrameLowering();
+ int Offset = (TFL->usePackedStack(MF) ? -2 : 14) *
+ getTargetMachine().getPointerSize(0);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, FrameAddr,
+ DAG.getConstant(Offset, DL, PtrVT));
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr,
+ MachinePointerInfo());
}

// Return R14D, which has the return address. Mark it an implicit live-in.
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 08b2b01b2ee1e23..ae7ed296c45ea88 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -941,6 +941,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
 case Attribute::NoSanitizeBounds:
 case Attribute::NoSanitizeCoverage:
 case Attribute::NullPointerIsValid:
+ case Attribute::OptimizeForDebugging:
 case Attribute::OptForFuzzing:
 case Attribute::OptimizeNone:
 case Attribute::OptimizeForSize:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d09bf3872f04f06..76701dba5840af2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9043,6 +9043,7 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL,
 // blocks.
 if (auto *PHI = dyn_cast(TEUseEI.UserTE->getMainOp())) {
 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
+ TEInsertPt = TEInsertBlock->getTerminator();
 } else {
 TEInsertBlock = TEInsertPt->getParent();
 }
@@ -9106,9 +9107,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL,
 const Instruction *InsertPt =
 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
 : &getLastInstructionInBundle(UseEI.UserTE);
- if (!UserPHI && TEInsertPt == InsertPt) {
- // If 2 gathers are operands of the same non-PHI entry,
- // compare operands indices, use the earlier one as the base.
+ if (TEInsertPt == InsertPt) {
+ // If 2 gathers are operands of the same entry (regardless of whether
+ // the user is a PHI or not), compare operand indices and use the
+ // earlier one as the base.
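+ // Concretely: when this gather's edge index is already the smaller of
+ // the two, the candidate with the larger index is skipped by the check
+ // below rather than being taken as the base.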
if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) continue; // If the user instruction is used for some reason in different @@ -10129,7 +10131,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - auto FindReusedSplat = [&](SmallVectorImpl &Mask) { + auto FindReusedSplat = [&](MutableArrayRef Mask) { if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { return isa(V) && !isa(V); })) diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index 9af648fe262a351..eaf670575f4dd73 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -511,6 +511,12 @@ define void @f87() fn_ret_thunk_extern { ret void } ; CHECK: define void @f88() [[SKIPPROFILE:#[0-9]+]] define void @f88() skipprofile { ret void } +define void @f89() optdebug +; CHECK: define void @f89() [[OPTDEBUG:#[0-9]+]] +{ + ret void; +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { memory(none) } @@ -566,4 +572,5 @@ define void @f88() skipprofile { ret void } ; CHECK: attributes #52 = { nosanitize_bounds } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } +; CHECK: attributes [[OPTDEBUG]] = { optdebug } ; CHECK: attributes #[[NOBUILTIN]] = { nobuiltin } diff --git a/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir new file mode 100644 index 000000000000000..328ee991da8f4a6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-short-64-bit-literals.mir @@ -0,0 +1,125 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=si-fold-operands -o - %s | FileCheck --check-prefix=GCN %s + +--- +name: no_fold_fp_64bit_literal_sgpr +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: no_fold_fp_64bit_literal_sgpr + ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200 + ; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, [[S_MOV_B64_]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]] + %0:vreg_64 = IMPLICIT_DEF + %1:sreg_64 = S_MOV_B64 1311768467750121200 + %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %2 +... + +--- +name: no_fold_fp_64bit_literal_vgpr +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: no_fold_fp_64bit_literal_vgpr + ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, [[V_MOV_B]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]] + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %2 +... 
+
+---
+name: fold_fp_32bit_literal_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_fp_32bit_literal_sgpr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64 = V_ADD_F64_e64 0, 4636737291354636288, 0, [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_ADD_F64_e64_]]
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 4636737291354636288
+    %2:vreg_64 = V_ADD_F64_e64 0, %1, 0, %0, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %2
+...
+
+---
+name: no_fold_int_64bit_literal_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: no_fold_int_64bit_literal_sgpr
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
+    ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], [[S_MOV_B64_]], implicit-def $scc
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 1311768467750121200
+    %2:sreg_64 = S_AND_B64 %0, %1, implicit-def $scc
+    SI_RETURN_TO_EPILOG %2
+...
+
+---
+name: fold_int_32bit_literal_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_int_32bit_literal_sgpr
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 2147483647, implicit-def $scc
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 2147483647
+    %2:sreg_64 = S_AND_B64 %0, %1, implicit-def $scc
+    SI_RETURN_TO_EPILOG %2
+...
+
+---
+name: fold_uint_32bit_literal_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_uint_32bit_literal_sgpr
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[DEF]], 4294967295, implicit-def $scc
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_AND_B64_]]
+    %0:sreg_64 = IMPLICIT_DEF
+    %1:sreg_64 = S_MOV_B64 4294967295
+    %2:sreg_64 = S_AND_B64 %0, %1, implicit-def $scc
+    SI_RETURN_TO_EPILOG %2
+...
+
+---
+name: no_fold_v2fp_64bit_literal_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vreg_64 = V_MOV_B64_PSEUDO 4629700418019000320, implicit $exec
+    %2:vreg_64 = V_PK_ADD_F32 0, %0, 0, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %2
+...
+
+---
+name: fold_v2fp_32bit_literal_sgpr
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vreg_64 = V_MOV_B64_PSEUDO 1065353216, implicit $exec
+    %2:vreg_64 = V_PK_ADD_F32 0, %0, 0, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    SI_RETURN_TO_EPILOG %2
+...
diff --git a/llvm/test/CodeGen/SystemZ/frameaddr-01.ll b/llvm/test/CodeGen/SystemZ/frameaddr-01.ll
index db3d7b33f94cae3..e8521883f08873c 100644
--- a/llvm/test/CodeGen/SystemZ/frameaddr-01.ll
+++ b/llvm/test/CodeGen/SystemZ/frameaddr-01.ll
@@ -25,4 +25,25 @@ entry:
   ret ptr %1
 }
 
+; Check the caller's frame address.
+define ptr @fpcaller() nounwind "backchain" {
+entry:
+; CHECK-LABEL: fpcaller:
+; CHECK: lg %r2, 0(%r15)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.frameaddress(i32 1)
+  ret ptr %0
+}
+
+; Check the caller's caller's frame address.
+define ptr @fpcallercaller() nounwind "backchain" {
+entry:
+; CHECK-LABEL: fpcallercaller:
+; CHECK: lg %r1, 0(%r15)
+; CHECK: lg %r2, 0(%r1)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.frameaddress(i32 2)
+  ret ptr %0
+}
+
 declare ptr @llvm.frameaddress(i32) nounwind readnone
diff --git a/llvm/test/CodeGen/SystemZ/frameaddr-02.ll b/llvm/test/CodeGen/SystemZ/frameaddr-02.ll
index a5e7e701c30e2d2..d3977928d41b87e 100644
--- a/llvm/test/CodeGen/SystemZ/frameaddr-02.ll
+++ b/llvm/test/CodeGen/SystemZ/frameaddr-02.ll
@@ -27,6 +27,29 @@ entry:
   ret ptr %1
 }
 
+; Check the caller's frame address.
+define ptr @fpcaller() #0 {
+entry:
+; CHECK-LABEL: fpcaller:
+; CHECK: lghi %r2, 152
+; CHECK: ag %r2, 152(%r15)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.frameaddress(i32 1)
+  ret ptr %0
+}
+
+; Check the caller's caller's frame address.
+define ptr @fpcallercaller() #0 {
+entry:
+; CHECK-LABEL: fpcallercaller:
+; CHECK: lg %r1, 152(%r15)
+; CHECK: lghi %r2, 152
+; CHECK: ag %r2, 152(%r1)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.frameaddress(i32 2)
+  ret ptr %0
+}
+
 ; Without back chain
 
 attributes #1 = { nounwind "packed-stack" }
diff --git a/llvm/test/CodeGen/SystemZ/ret-addr-01.ll b/llvm/test/CodeGen/SystemZ/ret-addr-01.ll
index 8111d2a72456d59..e3644feb0b49b7c 100644
--- a/llvm/test/CodeGen/SystemZ/ret-addr-01.ll
+++ b/llvm/test/CodeGen/SystemZ/ret-addr-01.ll
@@ -12,4 +12,27 @@ entry:
   ret ptr %0
 }
 
+; Check the caller's return address.
+define ptr @rtcaller() nounwind "backchain" {
+entry:
+; CHECK-LABEL: rtcaller:
+; CHECK: lg %r1, 0(%r15)
+; CHECK: lg %r2, 112(%r1)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.returnaddress(i32 1)
+  ret ptr %0
+}
+
+; Check the caller's caller's return address.
+define ptr @rtcallercaller() nounwind "backchain" {
+entry:
+; CHECK-LABEL: rtcallercaller:
+; CHECK: lg %r1, 0(%r15)
+; CHECK: lg %r1, 0(%r1)
+; CHECK: lg %r2, 112(%r1)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.returnaddress(i32 2)
+  ret ptr %0
+}
+
 declare ptr @llvm.returnaddress(i32) nounwind readnone
diff --git a/llvm/test/CodeGen/SystemZ/ret-addr-02.ll b/llvm/test/CodeGen/SystemZ/ret-addr-02.ll
new file mode 100644
index 000000000000000..5c1e56d56c0284e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/ret-addr-02.ll
@@ -0,0 +1,39 @@
+; Test support for the llvm.returnaddress intrinsic with packed-stack.
+
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; The current function's return address is in the link register.
+attributes #0 = { nounwind "packed-stack" "backchain" "use-soft-float"="true" }
+define ptr @rt0() #0 {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: lgr %r2, %r14
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.returnaddress(i32 0)
+  ret ptr %0
+}
+
+; Check the caller's return address.
+define ptr @rtcaller() #0 {
+entry:
+; CHECK-LABEL: rtcaller:
+; CHECK: lg %r1, 152(%r15)
+; CHECK: lg %r2, 136(%r1)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.returnaddress(i32 1)
+  ret ptr %0
+}
+
+; Check the caller's caller's return address.
+define ptr @rtcallercaller() #0 {
+entry:
+; CHECK-LABEL: rtcallercaller:
+; CHECK: lg %r1, 152(%r15)
+; CHECK: lg %r1, 152(%r1)
+; CHECK: lg %r2, 136(%r1)
+; CHECK: br %r14
+  %0 = tail call ptr @llvm.returnaddress(i32 2)
+  ret ptr %0
+}
+
+declare ptr @llvm.returnaddress(i32) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/malloc-free.ll b/llvm/test/Transforms/InstCombine/malloc-free.ll
index dc918a7fc8080b3..29c757f82564ad8 100644
--- a/llvm/test/Transforms/InstCombine/malloc-free.ll
+++ b/llvm/test/Transforms/InstCombine/malloc-free.ll
@@ -26,6 +26,15 @@ define i32 @dead_aligned_alloc(i32 %size, i32 %alignment, i8 %value) {
   ret i32 0
 }
 
+define i1 @aligned_alloc_only_pointer(i32 %size, i32 %alignment, i8 %value) {
+; CHECK-LABEL: @aligned_alloc_only_pointer(
+; CHECK-NEXT:    ret i1 true
+;
+  %aligned_allocation = tail call ptr @aligned_alloc(i32 %alignment, i32 %size)
+  %cmp = icmp ne ptr %aligned_allocation, null
+  ret i1 %cmp
+}
+
 declare noalias ptr @calloc(i32, i32) nounwind allockind("alloc,zeroed") allocsize(0,1) "alloc-family"="malloc"
 declare noalias ptr @malloc(i32) allockind("alloc,uninitialized") allocsize(0) "alloc-family"="malloc"
 declare noalias ptr @aligned_alloc(i32, i32) allockind("alloc,uninitialized,aligned") allocsize(1) "alloc-family"="malloc"
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
index 28e0b06f6967368..e5d7ad138b4def2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-gather-nodes-phi-users.ll
@@ -8,7 +8,7 @@
 ; YAML: Function: test
 ; YAML: Args:
 ; YAML:   - String: 'Stores SLP vectorized with cost '
-; YAML:   - Cost: '-3'
+; YAML:   - Cost: '-6'
 ; YAML:   - String: ' and with tree size '
 ; YAML:   - TreeSize: '14'
 ; YAML: ...
diff --git a/llvm/test/tools/llvm-dwp/X86/cu_tu_units_manual_v5_invalid.s b/llvm/test/tools/llvm-dwp/X86/cu_tu_units_manual_v5_invalid.s
new file mode 100644
index 000000000000000..d1ab9f75b74c8fa
--- /dev/null
+++ b/llvm/test/tools/llvm-dwp/X86/cu_tu_units_manual_v5_invalid.s
@@ -0,0 +1,73 @@
+# This test checks that llvm-dwarfdump correctly reports errors when parsing
+# DWARF unit headers in DWP files.
+
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o \
+# RUN:   -split-dwarf-file=%t.dwo -dwarf-version=5
+# RUN: llvm-dwp %t.dwo -o %t.dwp
+# RUN: llvm-dwarfdump -debug-info -debug-cu-index -debug-tu-index \
+# RUN:   -manually-generate-unit-index %t.dwp 2>&1 | FileCheck %s
+
+## Note: In order to check whether the type unit index is generated,
+## there is no need to add the missing DIEs for the structure type of the type unit.
+
+# CHECK-NOT: .debug_info.dwo contents:
+
+# CHECK-DAG: .debug_cu_index contents:
+# CHECK: Failed to parse CU header in DWP file: DWARF unit at offset 0x00000000 has unsupported version 6, supported are 2-5
+
+# CHECK-DAG: .debug_tu_index contents:
+# CHECK: Failed to parse CU header in DWP file: DWARF unit at offset 0x00000000 has unsupported version 6, supported are 2-5
+
+    .section .debug_info.dwo,"e",@progbits
+    .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+    .short 6                               # DWARF version number
+    .byte 6                                # DWARF Unit Type (DW_UT_split_type)
+    .byte 8                                # Address Size (in bytes)
+    .long 0                                # Offset Into Abbrev.
Section + .quad 5657452045627120676 # Type Signature + .long 25 # Type DIE Offset + .byte 2 # Abbrev [2] DW_TAG_type_unit + .byte 3 # Abbrev [3] DW_TAG_structure_type + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit +.Ldebug_info_dwo_start1: + .short 6 # DWARF version number + .byte 6 # DWARF Unit Type (DW_UT_split_type) + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section + .quad -8528522068957683993 # Type Signature + .long 25 # Type DIE Offset + .byte 4 # Abbrev [4] DW_TAG_type_unit + .byte 5 # Abbrev [5] DW_TAG_structure_type + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end1: + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end2-.Ldebug_info_dwo_start2 # Length of Unit +.Ldebug_info_dwo_start2: + .short 6 # DWARF version number + .byte 5 # DWARF Unit Type (DW_UT_split_compile) + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section + .quad 1152943841751211454 + .byte 1 # Abbrev [1] DW_TAG_compile_unit +.Ldebug_info_dwo_end2: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 0 # EOM + .byte 0 # EOM + .byte 4 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 0 # EOM + .byte 0 # EOM + .byte 0 # EOM diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp index 2adc2403eaca9e6..9f4fe9c54a928fd 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp @@ -2170,7 +2170,11 @@ TEST(DWARFDebugInfo, TestDWARF64UnitLength) { DWARFDataExtractor Data(Obj, Sec, /* IsLittleEndian = */ true, /* AddressSize = */ 4); uint64_t Offset = 0; - EXPECT_FALSE(Header.extract(*Context, Data, &Offset, DW_SECT_INFO)); + ASSERT_THAT_ERROR( + Header.extract(*Context, Data, &Offset, DW_SECT_INFO), + FailedWithMessage( + "DWARF unit from offset 0x00000000 incl. to offset " + "0x1122334455667794 excl. extends past section size 0x00000018")); // Header.extract() returns false because there is not enough space // in the section for the declared length. Anyway, we can check that // the properties are read correctly. 
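A note on the error-handling pattern the updated DWARFDebugInfoTest above relies
on: Header.extract now returns an llvm::Error carrying a message rather than a
bool, and an llvm::Error must be consumed before it is destroyed. The following
is a minimal sketch of consuming such an error and retrieving its text outside a
gtest harness; parseHeader is a hypothetical stand-in for the Header.extract
call, not an API introduced by the patch.

    #include "llvm/Support/Error.h"
    #include <string>

    llvm::Error parseHeader(); // hypothetical stand-in for Header.extract(...)

    std::string describeParseFailure() {
      std::string Msg = "success";
      // handleAllErrors consumes the Error; ErrorInfoBase::message() yields
      // the same text that FailedWithMessage() matches in the unit test above.
      llvm::handleAllErrors(parseHeader(), [&](const llvm::ErrorInfoBase &EIB) {
        Msg = EIB.message();
      });
      return Msg;
    }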
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 5e822078b947123..f7a7172d61fc618 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -1935,7 +1935,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { if (NumProcs) OS << Target << "SubTypeKV, "; else - OS << "None, "; + OS << "std::nullopt, "; OS << '\n'; OS.indent(22); OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, " @@ -2028,7 +2028,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { if (NumProcs) OS << "ArrayRef(" << Target << "SubTypeKV, " << NumProcs << "), "; else - OS << "None, "; + OS << "std::nullopt, "; OS << '\n'; OS.indent(24); OS << Target << "WriteProcResTable, " << Target << "WriteLatencyTable, " diff --git a/llvm/utils/emacs/llvm-mode.el b/llvm/utils/emacs/llvm-mode.el index e37cc693a1940aa..53381cf91b17b90 100644 --- a/llvm/utils/emacs/llvm-mode.el +++ b/llvm/utils/emacs/llvm-mode.el @@ -25,7 +25,7 @@ '("alwaysinline" "argmemonly" "allocsize" "builtin" "cold" "convergent" "dereferenceable" "dereferenceable_or_null" "hot" "immarg" "inaccessiblememonly" "inaccessiblemem_or_argmemonly" "inalloca" "inlinehint" "jumptable" "minsize" "mustprogress" "naked" "nobuiltin" "nonnull" "nocapture" "nocallback" "nocf_check" "noduplicate" "nofree" "noimplicitfloat" "noinline" "nomerge" "nonlazybind" "noprofile" "noredzone" "noreturn" - "norecurse" "nosync" "noundef" "nounwind" "nosanitize_bounds" "nosanitize_coverage" "null_pointer_is_valid" "optforfuzzing" "optnone" "optsize" "preallocated" "readnone" "readonly" "returned" "returns_twice" + "norecurse" "nosync" "noundef" "nounwind" "nosanitize_bounds" "nosanitize_coverage" "null_pointer_is_valid" "optdebug" "optforfuzzing" "optnone" "optsize" "preallocated" "readnone" "readonly" "returned" "returns_twice" "shadowcallstack" "signext" "speculatable" "speculative_load_hardening" "ssp" "sspreq" "sspstrong" "safestack" "sanitize_address" "sanitize_hwaddress" "sanitize_memtag" "sanitize_thread" "sanitize_memory" "strictfp" "swifterror" "uwtable" "vscale_range" "willreturn" "writeonly" "zeroext") 'symbols) . font-lock-constant-face) ;; Variables diff --git a/llvm/utils/kate/llvm.xml b/llvm/utils/kate/llvm.xml index 9f7ec77bf3154d4..0e7aec3880e6b4a 100644 --- a/llvm/utils/kate/llvm.xml +++ b/llvm/utils/kate/llvm.xml @@ -111,6 +111,7 @@ nosync nounwind null_pointer_is_valid + optdebug optforfuzzing optnone optsize diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index 9185a029a22e570..d86e3d1ddbc27ff 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -142,6 +142,7 @@ syn keyword llvmKeyword \ nosanitize_bounds \ nosanitize_coverage \ null_pointer_is_valid + \ optdebug \ optforfuzzing \ optnone \ optsize diff --git a/mlir/include/mlir/Analysis/Presburger/Fraction.h b/mlir/include/mlir/Analysis/Presburger/Fraction.h index a410f528e1f8001..afcbed84c66bc3c 100644 --- a/mlir/include/mlir/Analysis/Presburger/Fraction.h +++ b/mlir/include/mlir/Analysis/Presburger/Fraction.h @@ -38,7 +38,7 @@ struct Fraction { } } /// Overloads for passing literals. 
-  Fraction(const MPInt &num, int64_t den = 1) : Fraction(num, MPInt(den)) {}
+  Fraction(const MPInt &num, int64_t den) : Fraction(num, MPInt(den)) {}
   Fraction(int64_t num, const MPInt &den = MPInt(1))
       : Fraction(MPInt(num), den) {}
   Fraction(int64_t num, int64_t den) : Fraction(MPInt(num), MPInt(den)) {}
diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
index b396b2c2c141307..f77b8cbbbc61d23 100644
--- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
+++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
@@ -40,7 +40,8 @@ class CF_Op<string mnemonic, list<Trait> traits = []> :
 def AssertOp : CF_Op<"assert"> {
   let summary = "Assert operation with message attribute";
   let description = [{
-    Assert operation with single boolean operand and an error message attribute.
+    Assert operation at runtime with a single boolean operand and an error
+    message attribute.
     If the argument is `true` this operation has no effect. Otherwise, the
     program execution will abort. The provided error message may be used by a
     runtime to propagate the error to the user.
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
index 9ca029b489ad144..44e82f452b3cef1 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td
@@ -86,6 +86,39 @@ def LinalgContractionOpInterface : OpInterface<"ContractionOpInterface"> {
       /*methodBody=*/[{
         return mlir::isRowMajorBatchMatmul($_op.getIndexingMaps());
       }]>,
+    InterfaceMethod<
+      /*desc=*/[{
+        Returns whether the given op has indexing maps that correspond to a
+        vector-matrix multiplication.
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"isVecmat",
+      /*args=*/(ins),
+      /*methodBody=*/[{
+        return mlir::isVecmat($_op.getIndexingMaps());
+      }]>,
+    InterfaceMethod<
+      /*desc=*/[{
+        Returns whether the given op has indexing maps that correspond to a
+        matrix-vector multiplication.
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"isMatvec",
+      /*args=*/(ins),
+      /*methodBody=*/[{
+        return mlir::isMatvec($_op.getIndexingMaps());
+      }]>,
+    InterfaceMethod<
+      /*desc=*/[{
+        Returns whether the given op has indexing maps that correspond to a
+        batched matrix-vector multiplication.
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"isBatchMatvec",
+      /*args=*/(ins),
+      /*methodBody=*/[{
+        return mlir::isBatchMatvec($_op.getIndexingMaps());
+      }]>,
   ];
 }
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
index d761743a82bf86b..39d24595ec1c446 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
@@ -58,8 +58,8 @@ def MeshSharding : AttrDef<Mesh_Dialect, "MeshSharding"> {
   let parameters = (ins
     AttrParameter<"::mlir::SymbolRefAttr", "cluster placed">:$cluster,
-    ArrayRefParameter<"::mlir::DenseI8ArrayAttr">:$split_axes,
-    OptionalArrayRefParameter<"int8_t">:$partial_axes,
+    ArrayRefParameter<"::mlir::DenseI32ArrayAttr">:$split_axes,
+    OptionalArrayRefParameter<"int32_t">:$partial_axes,
     OptionalParameter<"::mlir::mesh::Partial">:$partial_type
   );
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
index 8ca4b6653104221..a8aa0a694bee29f 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
@@ -70,7 +70,7 @@ def Mesh_ClusterOp : Mesh_Op<"cluster", [Symbol]> {
   }];
   let arguments = (ins
     SymbolNameAttr:$sym_name,
-    I8Attr:$rank,
+    I64Attr:$rank,
     DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$dim_sizes
   );
   let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index dd00355b6d77e33..440f7d0380eb17e 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -522,8 +522,8 @@ def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
     nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
     ```
   }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers)";
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
 }
 
 def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
@@ -597,8 +597,8 @@ def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
    nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
    ```
   }];
-  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId);
-  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount attr-dict `:` type($barriers)";
+  let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
+  let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
 }
 
 def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
@@ -627,11 +627,11 @@ def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
   }];
   let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
   let assemblyFormat = [{
-    $tensorMapDescriptor (`,` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
+    $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
   }];
 }
 
-def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
+def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
   let summary = "TMA asynchronous load";
   let description = [{
     The Op loads a tile memory region from global memory to shared memory by
@@ -646,10 +646,14 @@ def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
                        NVGPU_MBarrierGroup:$barriers,
                        NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
                        Variadic<Index>:$coordinates,
-                       Index:$mbarId);
+                       Index:$mbarId,
+                       Optional<I1>:$predicate);
   let assemblyFormat = [{
-    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` `to` $dst
-      attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) `->` type($dst)
+    $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
+    `to` $dst
+    (`,` `predicate` `=` $predicate^)?
+    attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
+    `->` type($dst)
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
index dab24bd93032692..225b9f287d340db 100644
--- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
@@ -49,6 +49,24 @@ bool isColumnMajorMatmul(ArrayAttr indexingMaps);
 /// the reduction.
 bool isRowMajorBatchMatmul(ArrayAttr indexingMaps);
 
+/// Tests whether the given maps describe a vector-matrix multiplication. The
+/// test is permutation-invariant. Note that this only checks the affine maps
+/// from an operation, so it does not perform any checks on the math being
+/// performed within the reduction.
+bool isVecmat(ArrayAttr indexingMaps);
+
+/// Tests whether the given maps describe a matrix-vector multiplication. The
+/// test is permutation-invariant. Note that this only checks the affine maps
+/// from an operation, so it does not perform any checks on the math being
+/// performed within the reduction.
+bool isMatvec(ArrayAttr indexingMaps);
+
+/// Tests whether the given maps describe a batch matrix-vector multiplication.
+/// The test is permutation-invariant. Note that this only checks the affine
+/// maps from an operation, so it does not perform any checks on the math being
+/// performed within the reduction.
+bool isBatchMatvec(ArrayAttr indexingMaps);
+
 /// Return positions in `iteratorTypes` that match `iteratorTypeName`.
 inline void findPositionsOfType(ArrayRef<utils::IteratorType> iteratorTypes,
                                 utils::IteratorType iteratorTypeName,
diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h
index 379392ace46961a..f1fabf95a68b7ad 100644
--- a/mlir/include/mlir/IR/OpImplementation.h
+++ b/mlir/include/mlir/IR/OpImplementation.h
@@ -350,8 +350,7 @@ template <typename AsmPrinterT, typename T,
           std::enable_if_t<!std::is_convertible<T &, Value &>::value &&
                                !std::is_convertible<T &, Type &>::value &&
                                !std::is_convertible<T &, Attribute &>::value &&
-                               !llvm::is_one_of<T, bool, int8_t,
-                                                uint8_t>::value,
+                               !llvm::is_one_of<T, bool>::value,
                            T> * = nullptr>
 inline std::enable_if_t::value,
                          AsmPrinterT &>
@@ -367,17 +366,6 @@ operator<<(AsmPrinterT &p, bool value) {
   return p << (value ? StringRef("true") : "false");
 }
 
-/// Specialization for 8-bit integers to ensure values are printed as integers
-// and not characters.
-template <
-    typename AsmPrinterT, typename T,
-    std::enable_if_t<llvm::is_one_of<T, int8_t, uint8_t>::value, T> * = nullptr>
-inline std::enable_if_t::value,
-                        AsmPrinterT &>
-operator<<(AsmPrinterT &p, T value) {
-  return p << static_cast<int>(value);
-}
-
 template <typename AsmPrinterT>
 inline std::enable_if_t::value, AsmPrinterT &>
diff --git a/mlir/include/mlir/Support/ADTExtras.h b/mlir/include/mlir/Support/ADTExtras.h
index 1e4708f8f7d3f9e..51ec7ac25dbb599 100644
--- a/mlir/include/mlir/Support/ADTExtras.h
+++ b/mlir/include/mlir/Support/ADTExtras.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_SUPPORT_ADTEXTRAS_H
 #define MLIR_SUPPORT_ADTEXTRAS_H
 
+#include "mlir/Support/LLVM.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index c8373e06f0db776..389a4621c14e594 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -3207,7 +3207,18 @@ void mlir::python::populateIRCore(py::module &m) {
            "Inserts an operation.")
       .def_property_readonly(
           "block", [](PyInsertionPoint &self) { return self.getBlock(); },
-          "Returns the block that this InsertionPoint points to.");
+          "Returns the block that this InsertionPoint points to.")
+      .def_property_readonly(
+          "ref_operation",
+          [](PyInsertionPoint &self) -> py::object {
+            auto refOperation = self.getRefOperation();
+            if (refOperation)
+              return refOperation->getObject();
+            return py::none();
+          },
+          "The reference operation before which new operations are "
+          "inserted, or None if the insertion point is at the end of "
+          "the block.");
 
 //----------------------------------------------------------------------------
 // Mapping of PyAttribute.
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index 3ca7dd851961a46..c5412e735dddcb5 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -833,6 +833,7 @@ class PyInsertionPoint {
                                 const pybind11::object &excTb);
 
   PyBlock &getBlock() { return block; }
+  std::optional<PyOperationRef> &getRefOperation() { return refOperation; }
 
 private:
   // Trampoline constructor that avoids null initializing members while
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 7eb6f42d2788e35..efcde2ba58bd685 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -830,11 +830,11 @@ struct NVGPUMBarrierInitLowering
                                       adaptor.getMbarId(), rewriter);
     Value count = truncToI32(b, adaptor.getCount());
     if (isMbarrierShared(mbarrierType)) {
-      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
-                                                              count, Value());
+      rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(
+          op, barrier, count, adaptor.getPredicate());
     } else {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count,
-                                                        Value());
+                                                        adaptor.getPredicate());
     }
     return success();
   }
@@ -929,12 +929,12 @@ struct NVGPUMBarrierArriveExpectTxLowering
 
     if (isMbarrierShared(op.getBarriers().getType())) {
       rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxSharedOp>(
-          op, barrier, txcount, Value());
+          op, barrier, txcount, adaptor.getPredicate());
       return success();
     }
 
     rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxOp>(
-        op, barrier, txcount, Value());
+        op, barrier, txcount, adaptor.getPredicate());
     return success();
   }
 };
@@ -985,7 +985,8 @@ struct NVGPUTmaAsyncLoadOpLowering
     }
 
     rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
-        op, dest, adaptor.getTensorMapDescriptor(), barrier, coords, Value());
+        op, dest, adaptor.getTensorMapDescriptor(), barrier, coords,
+        adaptor.getPredicate());
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index b2a47102528758c..fc91fd994f12dc2 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -47,7 +47,7 @@ Operation *MeshDialect::materializeConstant(OpBuilder &builder, Attribute value,
 
 LogicalResult ClusterOp::verify() {
   ArrayRef<int64_t> dimSizes = getDimSizes();
-  uint8_t rank = getRank();
+  uint64_t rank = getRank();
 
   if (rank == 0)
     return emitOpError("rank of cluster is expected to be a positive integer");
@@ -71,15 +71,15 @@ LogicalResult ClusterOp::verify() {
 
 LogicalResult
 MeshShardingAttr::verify(function_ref<InFlightDiagnostic()> emitError,
-                         SymbolRefAttr, ArrayRef<DenseI8ArrayAttr> splitAxes,
-                         ArrayRef<int8_t> partialAxes, Partial) {
+                         SymbolRefAttr, ArrayRef<DenseI32ArrayAttr> splitAxes,
+                         ArrayRef<int32_t> partialAxes, Partial) {
   // TODO: At present cluster symbol ref is not verified. This is due to the
   // difficulty in fetching the corresponding symbol op based on an attribute.
 
-  llvm::SmallSet<int8_t, 4> visitedAxes;
+  llvm::SmallSet<int32_t, 4> visitedAxes;
 
-  auto checkMeshAxis = [&](ArrayRef<int8_t> axesArray) -> LogicalResult {
-    for (int8_t axis : axesArray) {
+  auto checkMeshAxis = [&](ArrayRef<int32_t> axesArray) -> LogicalResult {
+    for (int32_t axis : axesArray) {
       if (axis < 0)
         return emitError() << "mesh axis is expected to be non-negative";
       if (!visitedAxes.insert(axis).second)
@@ -88,8 +88,8 @@ MeshShardingAttr::verify(function_ref<InFlightDiagnostic()> emitError,
     return success();
   };
 
-  for (DenseI8ArrayAttr subAxes : splitAxes) {
-    ArrayRef<int8_t> subAxesArray = subAxes.asArrayRef();
+  for (DenseI32ArrayAttr subAxes : splitAxes) {
+    ArrayRef<int32_t> subAxesArray = subAxes.asArrayRef();
     if (failed(checkMeshAxis(subAxesArray)))
       return failure();
   }
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index eaaadbbea4d0a75..408c1dc798feeb4 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -922,7 +922,7 @@ HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) {
   Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
   rewriter.create<nvgpu::MBarrierInitOp>(
       loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads),
-      zero);
+      zero, Value());
  rewriter.create<gpu::BarrierOp>(loc);
   return cast<TypedValue<nvgpu::MBarrierGroupType>>(barrier);
 }
@@ -964,7 +964,8 @@ OpFoldResult HopperBuilder::buildTmaAsyncLoad(
   MLIRContext *ctx = rewriter.getContext();
   Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
   Operation *loadOp = rewriter.create<nvgpu::TmaAsyncLoadOp>(
-      loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}, zero);
+      loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}, zero,
+      Value());
   loadOps.push_back(loadOp);
   auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref);
   SmallVector<AffineExpr> symbols(mixedSizes.size());
@@ -989,7 +990,8 @@ void HopperBuilder::buildBarrierArriveTx(
       affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, mixedSizes);
   Value sizeVal = getValueOrCreateConstantIndexOp(rewriter, loc, size);
   Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-  rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal, zero);
+  rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal, zero,
+                                                   Value());
 }
 
 void HopperBuilder::buildTryWaitParity(
diff --git a/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp b/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
index a2977901f4751d4..641ddf3f91cb2d9 100644
--- a/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
@@ -21,9 +21,9 @@ bool mlir::isRowMajorMatmul(ArrayAttr indexingMaps) {
   if (indexingMaps.size() != 3)
     return false;
 
-  auto map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
-  auto map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
-  auto map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
+  AffineMap map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
+  AffineMap map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
+  AffineMap map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
 
   if (map0.getNumResults() != 2 || map1.getNumResults() != 2 ||
       map2.getNumResults() != 2 || map0.getNumInputs() != 3 ||
@@ -47,9 +47,9 @@ bool mlir::isColumnMajorMatmul(ArrayAttr indexingMaps) {
   if (indexingMaps.size() != 3)
     return false;
 
-  auto map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
-  auto map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
-  auto map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
+  AffineMap map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
+  AffineMap map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
+  AffineMap map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
 
   if (map0.getNumResults() != 2 || map1.getNumResults() != 2 ||
       map2.getNumResults() != 2 || map0.getNumInputs() != 3 ||
@@ -73,9 +73,9 @@ bool mlir::isRowMajorBatchMatmul(ArrayAttr indexingMaps) {
   if (indexingMaps.size() != 3)
     return false;
 
-  auto map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
-  auto map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
-  auto map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
+  AffineMap map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
+  AffineMap map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
+  AffineMap map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
 
   if (map0.getNumResults() != 3 || map1.getNumResults() != 3 ||
       map2.getNumResults() != 3 || map0.getNumInputs() != 4 ||
@@ -96,6 +96,79 @@ bool mlir::isRowMajorBatchMatmul(ArrayAttr indexingMaps) {
   return indexingMaps == maps;
 }
 
+bool mlir::isVecmat(ArrayAttr indexingMaps) {
+  if (indexingMaps.size() != 3)
+    return false;
+  AffineMap map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
+  AffineMap map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
+  AffineMap map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
+
+  if (map0.getNumResults() != 1 || map1.getNumResults() != 2 ||
+      map2.getNumResults() != 1 || map0.getNumInputs() != 2 ||
+      map1.getNumInputs() != 2 || map2.getNumInputs() != 2) {
+    return false;
+  }
+
+  // Extract dimensions for K * KxN -> N
+  AffineExpr k = map0.getResult(0);
+  AffineExpr n = map2.getResult(0);
+  auto *context = indexingMaps.getContext();
+  auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {k}, context));
+  auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k, n}, context));
+  auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, context));
+  auto maps = ArrayAttr::get(context, {mapA, mapB, mapC});
+  return indexingMaps == maps;
+}
+
+bool mlir::isMatvec(ArrayAttr indexingMaps) {
+  if (indexingMaps.size() != 3)
+    return false;
+  AffineMap map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
+  AffineMap map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
+  AffineMap map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
+
+  if (map0.getNumResults() != 2 || map1.getNumResults() != 1 ||
+      map2.getNumResults() != 1 || map0.getNumInputs() != 2 ||
+      map1.getNumInputs() != 2 || map2.getNumInputs() != 2) {
+    return false;
+  }
+
+  // Extract dimensions for N*K * K -> N
+  AffineExpr k = map1.getResult(0);
+  AffineExpr n = map2.getResult(0);
+  auto *context = indexingMaps.getContext();
+  auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {n, k}, context));
+  auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k}, context));
+  auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, context));
+  auto maps = ArrayAttr::get(context, {mapA, mapB, mapC});
+  return indexingMaps == maps;
+}
+
+bool mlir::isBatchMatvec(ArrayAttr indexingMaps) {
+  if (indexingMaps.size() != 3)
+    return false;
+  AffineMap map0 = cast<AffineMapAttr>(indexingMaps[0]).getValue();
+  AffineMap map1 = cast<AffineMapAttr>(indexingMaps[1]).getValue();
+  AffineMap map2 = cast<AffineMapAttr>(indexingMaps[2]).getValue();
+
+  if (map0.getNumResults() != 3 || map1.getNumResults() != 2 ||
+      map2.getNumResults() != 2 || map0.getNumInputs() != 3 ||
+      map1.getNumInputs() != 3 || map2.getNumInputs() != 3) {
+    return false;
+  }
+
+  // Extract dimensions for B*N*K * B*K -> B*N
+  AffineExpr b = map0.getResult(0);
+  AffineExpr k = map1.getResult(1);
+  AffineExpr n = map2.getResult(1);
+  auto *context = indexingMaps.getContext();
+  auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {b, n, k}, context));
+  auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {b, k}, context));
+  auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {b, n}, context));
+  auto maps = ArrayAttr::get(context, {mapA, mapB, mapC});
+  return indexingMaps == maps;
+}
+
 Operation *mlir::clone(OpBuilder &b, Operation *op, TypeRange newResultTypes,
                        ValueRange newOperands) {
   IRMapping bvm;
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index e8f4440d216eeb4..2609117dd220bea 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -755,6 +755,8 @@ class InsertionPoint:
     def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ...
     @property
    def block(self) -> Block: ...
+    @property
+    def ref_operation(self) -> Optional[_OperationBase]: ...
 
 # TODO: Auto-generated. Audit and fix.
 class IntegerAttr(Attribute):
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index a344578def39e06..c7d28e7443695fc 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -600,6 +600,42 @@ func.func @mbarrier_txcount() {
     func.return
 }
 
+// CHECK-LABEL: func @mbarrier_txcount_pred
+func.func @mbarrier_txcount_pred() {
+    %mine = arith.constant 1 : index
+    // CHECK: %[[c0:.+]] = arith.constant 0 : index
+    // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64
+    // CHECK: %[[S2:.+]] = gpu.thread_id x
+    // CHECK: %[[P:.+]] = arith.cmpi eq, %[[S2]], %[[c0]] : index
+    %c0 = arith.constant 0 : index
+    %tidx = gpu.thread_id x
+    %pred = arith.cmpi eq, %tidx, %c0 : index
+
+    // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3>
+    %barrier = nvgpu.mbarrier.create -> !barrierType
+
+    // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
+    // CHECK: nvvm.mbarrier.init.shared %[[barPtr]], {{.*}}, predicate = %[[P]]
+    nvgpu.mbarrier.init %barrier[%c0], %mine, predicate = %pred : !barrierType
+
+    %txcount = arith.constant 256 : index
+    // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+    // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
+    // CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]], {{.*}}, predicate = %[[P]]
+    nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount, predicate = %pred : !barrierType
+
+    %phase = arith.constant 0 : index
+    %ticks =
arith.constant 10000000 : index + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 + // CHECK: nvvm.mbarrier.try_wait.parity.shared %[[barPtr3]] + nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase, %ticks : !barrierType + + func.return +} + // CHECK-LABEL: func @async_tma_load !tensorMap1d = !nvgpu.tensormap.descriptor, swizzle=none, l2promo = none, oob = nan, interleave = none> !tensorMap2d = !nvgpu.tensormap.descriptor, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> @@ -630,6 +666,32 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d func.return } +// CHECK-LABEL: func @async_tma_load_pred +func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, + %buffer1d: memref<128xf32,3>, + %buffer2d: memref<32x32xf32,3>, + %buffer3d: memref<2x32x32xf32,3>, + %buffer4d: memref<2x2x32x32xf32,3>, + %buffer5d: memref<2x2x2x32x32xf32,3>, + %mbarrier: !mbarrier, + %p: i1) { + %c0 = arith.constant 0 : index + %crd0 = arith.constant 0 : index + %crd1 = arith.constant 0 : index + // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}], predicate = %{{.*}} + nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d, predicate = %p : !tensorMap1d, !mbarrier -> memref<128xf32,3> + // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}], predicate = %{{.*}} + nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d, predicate = %p : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> + // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}} + nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d, predicate = %p : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> + // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}} + nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d, predicate = %p : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> + // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}} + nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier[%c0] to %buffer5d, predicate = %p : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3> + func.return +} + + func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : memref<128xf32>) { %crd0 = arith.constant 64 : index %crd1 = arith.constant 128 : index @@ -650,7 +712,7 @@ func.func @tma_prefetch(%tensorMap1d: !tensorMap1d, %p : i1) { // CHECK: nvvm.prefetch.tensormap %[[S0]] : !llvm.ptr nvgpu.tma.prefetch.descriptor %tensorMap1d: !tensorMap1d // CHECK: nvvm.prefetch.tensormap %[[S0]], predicate = %[[arg1]] : !llvm.ptr, i1 - nvgpu.tma.prefetch.descriptor %tensorMap1d, %p: !tensorMap1d + nvgpu.tma.prefetch.descriptor %tensorMap1d, predicate = %p: !tensorMap1d func.return } diff --git a/mlir/test/python/ir/insertion_point.py b/mlir/test/python/ir/insertion_point.py index 0dc7d757f56d192..268d2e77d036f5e 100644 
--- a/mlir/test/python/ir/insertion_point.py +++ b/mlir/test/python/ir/insertion_point.py @@ -27,6 +27,8 @@ def test_insert_at_block_end(): ) entry_block = module.body.operations[0].regions[0].blocks[0] ip = InsertionPoint(entry_block) + assert ip.block == entry_block + assert ip.ref_operation is None ip.insert(Operation.create("custom.op2")) # CHECK: "custom.op1" # CHECK: "custom.op2" @@ -51,6 +53,8 @@ def test_insert_before_operation(): ) entry_block = module.body.operations[0].regions[0].blocks[0] ip = InsertionPoint(entry_block.operations[1]) + assert ip.block == entry_block + assert ip.ref_operation == entry_block.operations[1] ip.insert(Operation.create("custom.op3")) # CHECK: "custom.op1" # CHECK: "custom.op3" @@ -75,6 +79,8 @@ def test_insert_at_block_begin(): ) entry_block = module.body.operations[0].regions[0].blocks[0] ip = InsertionPoint.at_block_begin(entry_block) + assert ip.block == entry_block + assert ip.ref_operation == entry_block.operations[0] ip.insert(Operation.create("custom.op1")) # CHECK: "custom.op1" # CHECK: "custom.op2" @@ -108,6 +114,8 @@ def test_insert_at_terminator(): ) entry_block = module.body.operations[0].regions[0].blocks[0] ip = InsertionPoint.at_block_terminator(entry_block) + assert ip.block == entry_block + assert ip.ref_operation == entry_block.operations[1] ip.insert(Operation.create("custom.op2")) # CHECK: "custom.op1" # CHECK: "custom.op2" diff --git a/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp index 583dbd463b91159..3f576bacebf6aad 100644 --- a/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp +++ b/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp @@ -240,4 +240,134 @@ TEST(isRowMajorBatchMatmul, FirstInputSwapped) { EXPECT_THAT(maps, Not(Truly(isRowMajorBatchMatmul))); } +TEST(isVecmat, Simple) { + MLIRContext context; + + AffineExpr k, n; + bindDims(&context, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k, n}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isVecmat)); +} + +TEST(isVecmat, BindingSwapped) { + MLIRContext context; + + AffineExpr k, n; + bindDims(&context, n, k); // bind in different order + auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k, n}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isVecmat)); +} + +TEST(isVecmat, WrongDimOrderMatrix) { + MLIRContext context; + + AffineExpr k, n; + bindDims(&context, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {n, k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Not(Truly(isVecmat))); +} + +TEST(isMatvec, Simple) { + MLIRContext context; + + AffineExpr k, n; + bindDims(&context, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {n, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isMatvec)); 
+} + +TEST(isMatvec, BindingSwapped) { + MLIRContext context; + + AffineExpr k, n; + bindDims(&context, n, k); // bind in different order + auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {n, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isMatvec)); +} + +TEST(isMatvec, WrongDimOrderMatrix) { + MLIRContext context; + + AffineExpr k, n; + bindDims(&context, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(2, 0, {k, n}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(2, 0, {k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(2, 0, {n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Not(Truly(isMatvec))); +} + +TEST(isBatchMatvec, Simple) { + MLIRContext context; + + AffineExpr batch, k, n; + bindDims(&context, batch, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isBatchMatvec)); +} + +TEST(isBatchMatvec, BindingSwapped) { + MLIRContext context; + + AffineExpr batch, k, n; + bindDims(&context, batch, n, k); // bind in different order + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isBatchMatvec)); +} + +TEST(isBatchMatvec, Matmul) { + MLIRContext context; + + AffineExpr m, n, k; + bindDims(&context, m, n, k); + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {m, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {k, n}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {m, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Not(Truly(isBatchMatvec))); +} + +TEST(isBatchMatvec, WrongDimOrderMatrix) { + MLIRContext context; + + AffineExpr batch, k, n; + bindDims(&context, batch, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k, n}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Not(Truly(isBatchMatvec))); +} + } // namespace diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h index 60dc439f9551c21..5db5e27ebe8886b 100644 --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -176,7 +176,7 @@ inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var, return TeamState.ICVState.*Var; } -__attribute__((always_inline, flatten)) inline uint32_t & +[[gnu::always_inline, gnu::flatten]] inline uint32_t & lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { switch (Kind) { case state::VK_NThreads: @@ -218,7 +218,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { __builtin_unreachable(); } 
-__attribute__((always_inline, flatten)) inline void *& +[[gnu::always_inline, gnu::flatten]] inline void *& lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { switch (Kind) { case state::VK_ParallelRegionFn: @@ -232,47 +232,45 @@ lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { /// A class without actual state used to provide a nice interface to lookup and /// update ICV values we can declare in global scope. template struct Value { - __attribute__((flatten, always_inline)) operator Ty() { + [[gnu::flatten, gnu::always_inline]] operator Ty() { return lookup(/* IsReadonly */ true, /* IdentTy */ nullptr, /* ForceTeamState */ false); } - __attribute__((flatten, always_inline)) Value &operator=(const Ty &Other) { + [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) { set(Other, /* IdentTy */ nullptr); return *this; } - __attribute__((flatten, always_inline)) Value &operator++() { + [[gnu::flatten, gnu::always_inline]] Value &operator++() { inc(1, /* IdentTy */ nullptr); return *this; } - __attribute__((flatten, always_inline)) Value &operator--() { + [[gnu::flatten, gnu::always_inline]] Value &operator--() { inc(-1, /* IdentTy */ nullptr); return *this; } - __attribute__((flatten, always_inline)) void + [[gnu::flatten, gnu::always_inline]] void assert_eq(const Ty &V, IdentTy *Ident = nullptr, bool ForceTeamState = false) { ASSERT(lookup(/* IsReadonly */ true, Ident, ForceTeamState) == V, nullptr); } private: - __attribute__((flatten, always_inline)) Ty & + [[gnu::flatten, gnu::always_inline]] Ty & lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState); return t; } - __attribute__((flatten, always_inline)) Ty &inc(int UpdateVal, - IdentTy *Ident) { + [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) { return (lookup(/* IsReadonly */ false, Ident, /* ForceTeamState */ false) += UpdateVal); } - __attribute__((flatten, always_inline)) Ty &set(Ty UpdateVal, - IdentTy *Ident) { + [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) { return (lookup(/* IsReadonly */ false, Ident, /* ForceTeamState */ false) = UpdateVal); } @@ -284,12 +282,12 @@ template struct Value { /// a nice interface to lookup and update ICV values /// we can declare in global scope. template struct PtrValue { - __attribute__((flatten, always_inline)) operator Ty() { + [[gnu::flatten, gnu::always_inline]] operator Ty() { return lookup(/* IsReadonly */ true, /* IdentTy */ nullptr, /* ForceTeamState */ false); } - __attribute__((flatten, always_inline)) PtrValue &operator=(const Ty Other) { + [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) { set(Other); return *this; } diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h index b31238fbbc9c749..af9e1a673e6a236 100644 --- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -115,7 +115,7 @@ void threads(atomic::OrderingTy Ordering); /// (hence all threads in the block are "aligned"). Also perform a fence before /// and after the barrier according to \p Ordering. Note that the /// fence might be part of the barrier if the target offers this. 
-__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering); +[[gnu::noinline]] void threadsAligned(atomic::OrderingTy Ordering); #pragma omp end assumes ///} diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h index 94da763717e22fe..4ab0aea46eea122 100644 --- a/openmp/libomptarget/DeviceRTL/include/Utils.h +++ b/openmp/libomptarget/DeviceRTL/include/Utils.h @@ -83,7 +83,7 @@ template inline DstTy convertViaPun(SrcTy V) { } /// A pointer variable that has by design an `undef` value. Use with care. -__attribute__((loader_uninitialized)) static void *const UndefPtr; +[[clang::loader_uninitialized]] static void *const UndefPtr; #define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) #define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index 809c5f03886b048..a792e5be568e6ee 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -27,8 +27,9 @@ using namespace ompx; // This variable should be visibile to the plugin so we override the default // hidden visibility. -DeviceEnvironmentTy CONSTANT(__omp_rtl_device_environment) - __attribute__((used, retain, weak, visibility("protected"))); +[[gnu::used, gnu::retain, gnu::weak, + gnu::visibility("protected")]] DeviceEnvironmentTy + CONSTANT(__omp_rtl_device_environment); uint32_t config::getDebugKind() { return __omp_rtl_debug_kind & __omp_rtl_device_environment.DebugKind; diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp index c75a694fce35b6d..822b8dc2dd5e671 100644 --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -345,7 +345,7 @@ uint32_t mapping::getNumberOfProcessorElements() { // TODO: This is a workaround for initialization coming from kernels outside of // the TU. We will need to solve this more correctly in the future. -int __attribute__((weak)) SHARED(IsSPMDMode); +[[gnu::weak]] int SHARED(IsSPMDMode); void mapping::init(bool IsSPMD) { if (mapping::isInitialThreadInLevel0(IsSPMD)) @@ -358,15 +358,15 @@ bool mapping::isGenericMode() { return !isSPMDMode(); } ///} extern "C" { -__attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { +[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() { return mapping::getThreadIdInBlock(); } -__attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { +[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { return impl::getNumberOfThreadsInBlock(mapping::DIM_X); } -__attribute__((noinline)) uint32_t __kmpc_get_warp_size() { +[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { return impl::getWarpSize(); } } diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp index 1610b74fc78bc97..2c0701bd5358fd9 100644 --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -68,10 +68,9 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { } // Invoke an outlined parallel function unwrapping arguments (up to 32). 
-__attribute__((always_inline)) void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, - int64_t nargs) { +[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, + int32_t bound_tid, void *fn, + void **args, int64_t nargs) { switch (nargs) { #include "generated_microtask_cases.gen" default: @@ -84,7 +83,7 @@ __attribute__((always_inline)) void invokeMicrotask(int32_t global_tid, extern "C" { -__attribute__((always_inline)) void +[[clang::always_inline]] void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads, int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs) { @@ -262,8 +261,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, __kmpc_end_sharing_variables(); } -__attribute__((noinline)) bool -__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { +[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { // Work function and arguments for L1 parallel region. *WorkFn = state::ParallelRegionFn; @@ -277,7 +275,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { return ThreadIsActive; } -__attribute__((noinline)) void __kmpc_kernel_end_parallel() { +[[clang::noinline]] void __kmpc_kernel_end_parallel() { // In case we have modified an ICV for this thread before a ThreadState was // created. We drop it now to not contaminate the next parallel region. ASSERT(!mapping::isSPMDMode(), nullptr); diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index 721137cb95d658b..c34adfb94d7c731 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -31,7 +31,7 @@ using namespace ompx; constexpr const uint32_t Alignment = 16; /// External symbol to access dynamic shared memory. -extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment))); +[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[]; #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc) /// The kernel environment passed to the init method by the compiler. @@ -46,8 +46,8 @@ namespace { ///{ extern "C" { -__attribute__((leaf)) void *malloc(uint64_t Size); -__attribute__((leaf)) void free(void *Ptr); +[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size); +[[gnu::weak, gnu::leaf]] void free(void *Ptr); } ///} @@ -105,10 +105,8 @@ struct SharedMemorySmartStackTy { } /// The actual storage, shared among all warps. 
- unsigned char Data[state::SharedScratchpadSize] - __attribute__((aligned(Alignment))); - unsigned char Usage[mapping::MaxThreadsPerTeam] - __attribute__((aligned(Alignment))); + [[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize]; + [[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam]; }; static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, @@ -423,11 +421,11 @@ int omp_get_initial_device(void) { return -1; } } extern "C" { -__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) { +[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { return memory::allocShared(Bytes, "Frontend alloc shared"); } -__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { +[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { memory::freeShared(Ptr, Bytes, "Frontend free shared"); } diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp index 3370c5a8472f0b9..b9a192f0d84df9a 100644 --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -523,13 +523,12 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) { impl::namedBarrier(); } -__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc, - int32_t TId) { +[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { synchronize::threadsAligned(atomic::OrderingTy::seq_cst); } -__attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc, - int32_t TId) { +[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc, + int32_t TId) { synchronize::threads(atomic::OrderingTy::seq_cst); } diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp index 6125236863098f5..b39465aaa2ace5f 100644 --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -19,7 +19,7 @@ using namespace ompx; -extern "C" __attribute__((weak)) int IsSPMDMode; +extern "C" [[gnu::weak]] int IsSPMDMode; namespace impl { diff --git a/openmp/libomptarget/DeviceRTL/src/exports b/openmp/libomptarget/DeviceRTL/src/exports index fbcda3ce8f555ca..288ddf90b4a9f2d 100644 --- a/openmp/libomptarget/DeviceRTL/src/exports +++ b/openmp/libomptarget/DeviceRTL/src/exports @@ -11,6 +11,8 @@ _ZN4ompx* IsSPMDMode +malloc +free memcmp printf __assert_fail
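For context on the attribute-spelling migration applied throughout the
DeviceRTL sources above: the GNU __attribute__((...)) syntax and the C++11
[[...]] syntax are equivalent in Clang, with GNU attribute names available
under the gnu:: namespace and Clang-specific ones under clang::. A minimal
illustrative sketch follows; the function names here are invented for the
example and are not part of the patch.

    // GNU spelling: the attribute list trails the __attribute__ keyword.
    __attribute__((noinline, flatten)) static int legacySpelling(int X) {
      return X + 1;
    }

    // C++11 spelling: the same attributes, namespaced and placed up front,
    // which stays readable when the declarator itself is long.
    [[gnu::noinline, gnu::flatten]] static int modernSpelling(int X) {
      return X + 1;
    }

    // Vendor-specific attributes map the same way, e.g.
    // __attribute__((loader_uninitialized)) becomes
    // [[clang::loader_uninitialized]].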