From 8410f4901691baf2a872d0b19f4d9fc98ffeb25d Mon Sep 17 00:00:00 2001 From: yowl Date: Thu, 4 Feb 2021 20:41:57 -0500 Subject: [PATCH 01/44] attempt with LLVMMethodCodeNode -> MethodCodeNode --- src/coreclr/jit/compiler.cpp | 11 + .../tools/Common/JitInterface/CorInfoBase.cs | 2 +- .../JitInterface/CorInfoImpl.Intrinsics.cs | 2 +- .../tools/Common/JitInterface/CorInfoImpl.cs | 6 +- .../Common/JitInterface/JitConfigProvider.cs | 1 + .../CodeGen/ILToLLVMImporter.cs | 70 ++++++ .../DependencyAnalysis/LLVMMethodCodeNode.cs | 38 ++-- .../LlvmUnboxingThunkNode.cs | 18 +- .../Compiler/LLVMCodegenCompilation.cs | 73 ++++++- .../Compiler/LLVMCodegenCompilationBuilder.cs | 26 ++- .../Compiler/RyuJitLLVMCompilation.cs | 200 ++++++++++++++++++ .../ILCompiler.LLVM/ILCompiler.LLVM.csproj | 5 + .../Compiler/RyuJitCompilation.cs | 4 +- .../JitInterface/CorInfoImpl.RyuJit.cs | 2 +- .../tools/aot/ILCompiler/ILCompiler.csproj | 11 +- src/coreclr/tools/aot/ilc.sln | 71 ++++++- 16 files changed, 481 insertions(+), 59 deletions(-) create mode 100644 src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index efdf66d864b3..3a2b32c02001 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4371,6 +4371,13 @@ void Compiler::EndPhase(Phases phase) mostRecentlyActivePhase = phase; } + +inline void DoLlvmPhase(Compiler* _compiler) +{ + fatal(CORJIT_SKIPPED); + //assert(false); +} + //------------------------------------------------------------------------ // compCompile: run phases needed for compilation // @@ -5046,6 +5053,10 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags Rationalizer rat(this); // PHASE_RATIONALIZE rat.Run(); + // TODO:after rat, but better before? + DoLlvmPhase(this); // DoPhase? + + return; // Here we do "simple lowering". When the RyuJIT backend works for all // platforms, this will be part of the more general lowering phase. For now, though, we do a separate // pass of "final lowering." 
We must do this before (final) liveness analysis, because this creates diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs b/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs index 5405efba6c80..582ec65e0f17 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs @@ -10,7 +10,7 @@ namespace Internal.JitInterface { - unsafe partial class CorInfoImpl + public unsafe partial class CorInfoImpl { [UnmanagedCallersOnly] static uint _getMethodAttribs(IntPtr thisHandle, IntPtr* ppException, CORINFO_METHOD_STRUCT_* ftn) diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs index bd6aaca2d7f4..7797c26b372c 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs @@ -8,7 +8,7 @@ namespace Internal.JitInterface { - internal unsafe partial class CorInfoImpl + public unsafe partial class CorInfoImpl { private struct IntrinsicKey { diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index f2d3cf086a3f..3bf78005e590 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -33,7 +33,7 @@ namespace Internal.JitInterface { - internal unsafe sealed partial class CorInfoImpl + public unsafe sealed partial class CorInfoImpl { // // Global initialization and state @@ -88,7 +88,7 @@ private static CorInfoImpl GetThis(IntPtr thisHandle) } [DllImport(JitSupportLibrary)] - private extern static CorJitResult JitCompileMethod(out IntPtr exception, + internal extern static CorJitResult JitCompileMethod(out IntPtr exception, IntPtr jit, IntPtr thisHandle, IntPtr callbacks, ref CORINFO_METHOD_INFO info, uint flags, out IntPtr nativeEntry, out uint codeSize); @@ -3363,6 +3363,8 @@ private uint getExpectedTargetArchitecture() return (uint)ImageFileMachine.ARM; case TargetArchitecture.ARM64: return (uint)ImageFileMachine.ARM64; + case TargetArchitecture.Wasm32: + return (uint)ImageFileMachine.AMD64; // TODO default: throw new NotImplementedException("Expected target architecture is not supported"); } diff --git a/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs b/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs index b9a806007828..bc65fd69b8d5 100644 --- a/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs +++ b/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs @@ -138,6 +138,7 @@ private static string GetTargetSpec(TargetDetails target) TargetArchitecture.X64 => "x64", TargetArchitecture.ARM => "arm", TargetArchitecture.ARM64 => "arm64", + TargetArchitecture.Wasm32 => "x64", // "wasm32", == needs a clrjit module _ => throw new NotImplementedException(target.Architecture.ToString()) }; diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs index a404c2614016..71da8e073844 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs @@ -7,6 +7,8 @@ using System.Diagnostics; using System.IO; using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using Internal.TypeSystem; using ILCompiler; using LLVMSharp.Interop; @@ -173,6 +175,74 @@ public 
ILImporter(LLVMCodegenCompilation compilation, MethodDesc method, MethodI _builder = Context.CreateBuilder(); } +// [DllImport(JitSupportLibrary)] +// private extern static CorJitResult JitCompileMethod(out IntPtr exception, +// IntPtr jit, IntPtr thisHandle, IntPtr callbacks, +// ref CORINFO_METHOD_INFO info, uint flags, out IntPtr nativeEntry, out uint codeSize); +// +// // replaces (totally?) Import method +// public void ImportRyuJit() +// { +// IntPtr exception; +// IntPtr nativeEntry; +// uint codeSize; +// var result = JitCompileMethod(out exception, +// _jit, (IntPtr)Unsafe.AsPointer(ref _this), _unmanagedCallbacks, +// ref methodInfo, (uint)CorJitFlag.CORJIT_FLAG_CALL_GETJITFLAGS, out nativeEntry, out codeSize); +// if (exception != IntPtr.Zero) +// { +// if (_lastException != null) +// { +// // If we captured a managed exception, rethrow that. +// // TODO: might not actually be the real reason. It could be e.g. a JIT failure/bad IL that followed +// // an inlining attempt with a type system problem in it... +// #if SUPPORT_JIT +// _lastException.Throw(); +// #else +// if (_lastException.SourceException is TypeSystemException) +// { +// // Type system exceptions can be turned into code that throws the exception at runtime. +// _lastException.Throw(); +// } +// else +// { +// // This is just a bug somewhere. +// throw new CodeGenerationFailedException(_methodCodeNode.Method, _lastException.SourceException); +// } +// #endif +// } +// +// // This is a failure we don't know much about. +// char* szMessage = GetExceptionMessage(exception); +// string message = szMessage != null ? new string(szMessage) : "JIT Exception"; +// throw new Exception(message); +// } +// if (result == CorJitResult.CORJIT_BADCODE) +// { +// ThrowHelper.ThrowInvalidProgramException(); +// } +// if (result == CorJitResult.CORJIT_IMPLLIMITATION) +// { +// #if READYTORUN +// throw new RequiresRuntimeJitException("JIT implementation limitation"); +// #else +// ThrowHelper.ThrowInvalidProgramException(); +// #endif +// } +// if (result != CorJitResult.CORJIT_OK) +// { +// #if SUPPORT_JIT +// // FailFast? 
+// throw new Exception("JIT failed"); +// #else +// throw new CodeGenerationFailedException(_methodCodeNode.Method); +// #endif +// } +// +// +// } + + public void Import() { FindBasicBlocks(); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs index c0972f02b745..cc287598997c 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs @@ -10,12 +10,12 @@ namespace ILCompiler.DependencyAnalysis { - internal abstract class LLVMMethodCodeNode : DependencyNodeCore + internal abstract class LLVMMethodCodeNode : MethodCodeNode// DependencyNodeCore { protected readonly MethodDesc _method; protected DependencyList _dependencies; - protected LLVMMethodCodeNode(MethodDesc method) + protected LLVMMethodCodeNode(MethodDesc method) : base(method) { Debug.Assert(!method.IsAbstract); _method = method; @@ -27,24 +27,10 @@ public void SetDependencies(DependencyList dependencies) _dependencies = dependencies; } - public MethodDesc Method - { - get - { - return _method; - } - } - public override bool StaticDependenciesAreComputed => CompilationCompleted; public bool CompilationCompleted { get; set; } - public void AppendMangledName(NameMangler nameMangler, Utf8StringBuilder sb) - { - sb.Append(nameMangler.GetMangledMethodName(_method)); - } - public int Offset => 0; - public bool RepresentsIndirectionCell => false; public override bool InterestingForDynamicDependencyAnalysis => false; public override bool HasDynamicDependencies => false; @@ -63,16 +49,16 @@ public LlvmMethodBodyNode(MethodDesc method) protected override string GetName(NodeFactory factory) => this.GetMangledName(factory.NameMangler); - public override IEnumerable GetStaticDependencies(NodeFactory factory) - { - var dependencies = new DependencyList(); - - foreach (DependencyListEntry node in _dependencies) - dependencies.Add(node); - - return dependencies; - } - + // public override IEnumerable GetStaticDependencies(NodeFactory factory) + // { + // var dependencies = new DependencyList(); + // + // foreach (DependencyListEntry node in _dependencies) + // dependencies.Add(node); + // + // return dependencies; + // } + // int ISortableNode.ClassCode => -1502960727; int ISortableNode.CompareToImpl(ISortableNode other, CompilerComparer comparer) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs index 46c6002181b8..c08c936f4086 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs @@ -15,15 +15,15 @@ public LlvmUnboxingThunkNode(MethodDesc method) protected override string GetName(NodeFactory factory) => this.GetMangledName(factory.NameMangler); - public override IEnumerable GetStaticDependencies(NodeFactory factory) - { - var dependencies = new DependencyList(); - - foreach (DependencyListEntry node in _dependencies) - dependencies.Add(node); - - return dependencies; - } + // public override IEnumerable GetStaticDependencies(NodeFactory factory) + // { + // var dependencies = new DependencyList(); + // + // foreach (DependencyListEntry node in _dependencies) + // dependencies.Add(node); 
+ // + // return dependencies; + // } int ISortableNode.ClassCode => -18942467; diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index 457e8ad5c59c..83e51ca5a439 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -2,7 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Collections.Generic; - +using System.Runtime.CompilerServices; +using System.Threading; using Internal.TypeSystem; using Internal.IL; @@ -10,26 +11,32 @@ using ILCompiler.DependencyAnalysisFramework; using LLVMSharp.Interop; using ILCompiler.LLVM; +using Internal.JitInterface; +using Internal.IL.Stubs; namespace ILCompiler { - public sealed class LLVMCodegenCompilation : Compilation + public sealed class LLVMCodegenCompilation : RyuJitCompilation { + private readonly ConditionalWeakTable _corinfos = new ConditionalWeakTable(); + // private CountdownEvent _compilationCountdown; + internal LLVMCodegenConfigProvider Options { get; } internal LLVMModuleRef Module { get; } internal LLVMTargetDataRef TargetData { get; } public new LLVMCodegenNodeFactory NodeFactory { get; } internal LLVMDIBuilderRef DIBuilder { get; } internal Dictionary DebugMetadataMap { get; } - internal LLVMCodegenCompilation( - DependencyAnalyzerBase dependencyGraph, + internal LLVMCodegenCompilation(DependencyAnalyzerBase dependencyGraph, LLVMCodegenNodeFactory nodeFactory, IEnumerable roots, ILProvider ilProvider, DebugInformationProvider debugInformationProvider, Logger logger, - LLVMCodegenConfigProvider options) - : base(dependencyGraph, nodeFactory, GetCompilationRoots(roots, nodeFactory), ilProvider, debugInformationProvider, null, logger) + LLVMCodegenConfigProvider options, + DevirtualizationManager devirtualizationManager, + InstructionSetSupport instructionSetSupport) + : base(dependencyGraph, nodeFactory, GetCompilationRoots(roots, nodeFactory), ilProvider, debugInformationProvider, logger, devirtualizationManager, instructionSetSupport, 0) { NodeFactory = nodeFactory; LLVMModuleRef m = LLVMModuleRef.CreateWithName(options.ModuleName); @@ -59,6 +66,8 @@ protected override void CompileInternal(string outputFile, ObjectDumper dumper) protected override void ComputeDependencyNodeDependencies(List> obj) { + // Determine the list of method we actually need to compile + var methodsToCompile = new List(); foreach (var dependency in obj) { var methodCodeNodeNeedingCode = dependency as LLVMMethodCodeNode; @@ -74,7 +83,57 @@ protected override void ComputeDependencyNodeDependencies(List methodsToCompile) + { + CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); + + foreach (MethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) + { + if (Logger.IsVerbose) + { + Logger.Writer.WriteLine($"Compiling {methodCodeNodeNeedingCode.Method}..."); + } + + CompileSingleMethod(corInfo, methodCodeNodeNeedingCode); + } + } + + private void CompileSingleMethod(CorInfoImpl corInfo, MethodCodeNode methodCodeNodeNeedingCode) + { + MethodDesc method = methodCodeNodeNeedingCode.Method; + + try + { + corInfo.CompileMethod(methodCodeNodeNeedingCode); + } + catch (CodeGenerationFailedException) + { + ILImporter.CompileMethod(this, (LLVMMethodCodeNode)methodCodeNodeNeedingCode); + } + catch (TypeSystemException ex) + { + // TODO: fail compilation if a switch was 
passed + + // Try to compile the method again, but with a throwing method body this time. + MethodIL throwingIL = TypeSystemThrowingILEmitter.EmitIL(method, ex); + corInfo.CompileMethod(methodCodeNodeNeedingCode, throwingIL); + + // TODO: Log as a warning. For now, just log to the logger; but this needs to + // have an error code, be supressible, the method name/sig needs to be properly formatted, etc. + // https://github.com/dotnet/corert/issues/72 + Logger.Writer.WriteLine($"Warning: Method `{method}` will always throw because: {ex.Message}"); + } + finally + { + // if (_compilationCountdown != null) + // _compilationCountdown.Signal(); } } diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs index 54455c93cdfd..430cdf9a3afb 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs @@ -8,6 +8,7 @@ using ILCompiler.DependencyAnalysisFramework; using Internal.IL; +using Internal.JitInterface; namespace ILCompiler { @@ -17,6 +18,7 @@ public sealed class LLVMCodegenCompilationBuilder : CompilationBuilder // calling the Use/Configure methods and still get something reasonable back. LLVMCodegenConfigProvider _config = new LLVMCodegenConfigProvider(Array.Empty()); private ILProvider _ilProvider = new CoreRTILProvider(); + private KeyValuePair[] _ryujitOptions = Array.Empty>(); public LLVMCodegenCompilationBuilder(CompilerTypeSystemContext context, CompilationModuleGroup group) : base(context, group, new CoreRTNameMangler(new LLVMNodeMangler(), false)) @@ -42,9 +44,31 @@ protected override ILProvider GetILProvider() public override ICompilation ToCompilation() { + ArrayBuilder jitFlagBuilder = new ArrayBuilder(); + + switch (_optimizationMode) + { + case OptimizationMode.None: + jitFlagBuilder.Add(CorJitFlag.CORJIT_FLAG_DEBUG_CODE); + break; + + case OptimizationMode.PreferSize: + jitFlagBuilder.Add(CorJitFlag.CORJIT_FLAG_SIZE_OPT); + break; + + case OptimizationMode.PreferSpeed: + jitFlagBuilder.Add(CorJitFlag.CORJIT_FLAG_SPEED_OPT); + break; + + default: + // Not setting a flag results in BLENDED_CODE. + break; + } + LLVMCodegenNodeFactory factory = new LLVMCodegenNodeFactory(_context, _compilationGroup, _metadataManager, _interopStubManager, _nameMangler, _vtableSliceProvider, _dictionaryLayoutProvider, GetPreinitializationManager()); + JitConfigProvider.Initialize(_context.Target, jitFlagBuilder.ToArray(), _ryujitOptions); DependencyAnalyzerBase graph = CreateDependencyGraph(factory, new ObjectNode.ObjectNodeComparer(new CompilerComparer())); - return new LLVMCodegenCompilation(graph, factory, _compilationRoots, _ilProvider, _debugInformationProvider, _logger, _config); + return new LLVMCodegenCompilation(graph, factory, _compilationRoots, _ilProvider, _debugInformationProvider, _logger, _config, _devirtualizationManager, _instructionSetSupport); } } diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs new file mode 100644 index 000000000000..53f8b6ca4e34 --- /dev/null +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs @@ -0,0 +1,200 @@ +// // Licensed to the .NET Foundation under one or more agreements. +// // The .NET Foundation licenses this file to you under the MIT license. 
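Taken together with the compiler.cpp hunk in this patch (DoLlvmPhase calling fatal(CORJIT_SKIPPED) right after rationalization), the driver changes above boil down to: try RyuJIT first, and fall back to the existing CIL-to-LLVM importer whenever the jit gives up. A condensed sketch of that flow, using the types from the hunks above; it assumes CorInfoImpl surfaces the skipped compilation as CodeGenerationFailedException, which is what the catch block above relies on, and it omits logging and the TypeSystemException path:

    // Illustrative only: a trimmed-down view of CompileSingleThreaded/CompileSingleMethod above.
    private void CompileWithLlvmFallback(CorInfoImpl corInfo, List<LLVMMethodCodeNode> methodsToCompile)
    {
        foreach (LLVMMethodCodeNode node in methodsToCompile)
        {
            try
            {
                // RyuJIT path: runs the managed jit front end; DoLlvmPhase currently
                // ends the attempt with fatal(CORJIT_SKIPPED).
                corInfo.CompileMethod(node);
            }
            catch (CodeGenerationFailedException)
            {
                // A skipped (or otherwise failed) jit compilation lands here, so the
                // existing IL importer still produces LLVM for the method.
                ILImporter.CompileMethod(this, node);
            }
        }
    }
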
+// +// using System; +// using System.Collections.Generic; +// using System.Runtime.CompilerServices; +// using System.Threading; +// +// using ILCompiler.DependencyAnalysis; +// using ILCompiler.DependencyAnalysisFramework; +// +// using Internal.IL; +// using Internal.IL.Stubs; +// using Internal.TypeSystem; +// using Internal.JitInterface; +// +// namespace ILCompiler +// { +// public sealed class RyuJitLLLVMCompilation : Compilation +// { +// private readonly ConditionalWeakTable _corinfos = new ConditionalWeakTable(); +// internal readonly RyuJitCompilationOptions _compilationOptions; +// private readonly ExternSymbolMappedField _hardwareIntrinsicFlags; +// private CountdownEvent _compilationCountdown; +// private readonly Dictionary _instructionSetMap; +// +// public InstructionSetSupport InstructionSetSupport { get; } +// +// internal RyuJitLLLVMCompilation( +// DependencyAnalyzerBase dependencyGraph, +// NodeFactory nodeFactory, +// IEnumerable roots, +// ILProvider ilProvider, +// DebugInformationProvider debugInformationProvider, +// Logger logger, +// DevirtualizationManager devirtualizationManager, +// InstructionSetSupport instructionSetSupport, +// RyuJitCompilationOptions options) +// : base(dependencyGraph, nodeFactory, roots, ilProvider, debugInformationProvider, devirtualizationManager, logger) +// { +// _compilationOptions = options; +// _hardwareIntrinsicFlags = new ExternSymbolMappedField(nodeFactory.TypeSystemContext.GetWellKnownType(WellKnownType.Int32), "g_cpuFeatures"); +// InstructionSetSupport = instructionSetSupport; +// +// _instructionSetMap = new Dictionary(); +// foreach (var instructionSetInfo in InstructionSetFlags.ArchitectureToValidInstructionSets(TypeSystemContext.Target.Architecture)) +// { +// if (!instructionSetInfo.Specifiable) +// continue; +// +// _instructionSetMap.Add(instructionSetInfo.ManagedName, instructionSetInfo.InstructionSet); +// } +// } +// +// protected override void CompileInternal(string outputFile, ObjectDumper dumper) +// { +// _dependencyGraph.ComputeMarkedNodes(); +// var nodes = _dependencyGraph.MarkedNodeList; +// +// NodeFactory.SetMarkingComplete(); +// ObjectWriter.EmitObject(outputFile, nodes, NodeFactory, dumper); +// } +// +// protected override void ComputeDependencyNodeDependencies(List> obj) +// { +// // Determine the list of method we actually need to compile +// var methodsToCompile = new List(); +// var canonicalMethodsToCompile = new HashSet(); +// +// foreach (DependencyNodeCore dependency in obj) +// { +// var methodCodeNodeNeedingCode = dependency as MethodCodeNode; +// if (methodCodeNodeNeedingCode == null) +// { +// // To compute dependencies of the shadow method that tracks dictionary +// // dependencies we need to ensure there is code for the canonical method body. 
+// var dependencyMethod = (ShadowConcreteMethodNode)dependency; +// methodCodeNodeNeedingCode = (MethodCodeNode)dependencyMethod.CanonicalMethodNode; +// } +// +// // We might have already queued this method for compilation +// MethodDesc method = methodCodeNodeNeedingCode.Method; +// if (method.IsCanonicalMethod(CanonicalFormKind.Any) +// && !canonicalMethodsToCompile.Add(method)) +// { +// continue; +// } +// +// methodsToCompile.Add(methodCodeNodeNeedingCode); +// } +// +// if ((_compilationOptions & RyuJitCompilationOptions.SingleThreadedCompilation) != 0) +// { +// CompileSingleThreaded(methodsToCompile); +// } +// else +// { +// CompileMultiThreaded(methodsToCompile); +// } +// } +// private void CompileMultiThreaded(List methodsToCompile) +// { +// if (Logger.IsVerbose) +// { +// Logger.Writer.WriteLine($"Compiling {methodsToCompile.Count} methods..."); +// } +// +// WaitCallback compileSingleMethodDelegate = m => +// { +// CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); +// CompileSingleMethod(corInfo, (MethodCodeNode)m); +// }; +// +// using (_compilationCountdown = new CountdownEvent(methodsToCompile.Count)) +// { +// +// foreach (MethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) +// { +// ThreadPool.QueueUserWorkItem(compileSingleMethodDelegate, methodCodeNodeNeedingCode); +// } +// +// _compilationCountdown.Wait(); +// _compilationCountdown = null; +// } +// } +// +// +// private void CompileSingleThreaded(List methodsToCompile) +// { +// CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); +// +// foreach (MethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) +// { +// if (Logger.IsVerbose) +// { +// Logger.Writer.WriteLine($"Compiling {methodCodeNodeNeedingCode.Method}..."); +// } +// +// CompileSingleMethod(corInfo, methodCodeNodeNeedingCode); +// } +// } +// +// private void CompileSingleMethod(CorInfoImpl corInfo, MethodCodeNode methodCodeNodeNeedingCode) +// { +// MethodDesc method = methodCodeNodeNeedingCode.Method; +// +// try +// { +// corInfo.CompileMethod(methodCodeNodeNeedingCode); +// } +// catch (TypeSystemException ex) +// { +// // TODO: fail compilation if a switch was passed +// +// // Try to compile the method again, but with a throwing method body this time. +// MethodIL throwingIL = TypeSystemThrowingILEmitter.EmitIL(method, ex); +// corInfo.CompileMethod(methodCodeNodeNeedingCode, throwingIL); +// +// // TODO: Log as a warning. For now, just log to the logger; but this needs to +// // have an error code, be supressible, the method name/sig needs to be properly formatted, etc. 
+// // https://github.com/dotnet/corert/issues/72 +// Logger.Writer.WriteLine($"Warning: Method `{method}` will always throw because: {ex.Message}"); +// } +// finally +// { +// if (_compilationCountdown != null) +// _compilationCountdown.Signal(); +// } +// } +// +// public override MethodIL GetMethodIL(MethodDesc method) +// { +// TypeDesc owningType = method.OwningType; +// string intrinsicId = InstructionSetSupport.GetHardwareIntrinsicId(TypeSystemContext.Target.Architecture, owningType); +// if (!string.IsNullOrEmpty(intrinsicId) +// && HardwareIntrinsicHelpers.IsIsSupportedMethod(method)) +// { +// InstructionSet instructionSet = _instructionSetMap[intrinsicId]; +// +// // If this is an instruction set that is optimistically supported, but is not one of the +// // intrinsics that are known to be always available, emit IL that checks the support level +// // at runtime. +// if (!InstructionSetSupport.IsInstructionSetSupported(instructionSet) +// && InstructionSetSupport.OptimisticFlags.HasInstructionSet(instructionSet)) +// { +// return HardwareIntrinsicHelpers.EmitIsSupportedIL(method, _hardwareIntrinsicFlags); +// } +// } +// +// return base.GetMethodIL(method); +// } +// } +// +// [Flags] +// public enum RyuJitCompilationOptions +// { +// MethodBodyFolding = 0x1, +// SingleThreadedCompilation = 0x2, +// } +// } diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj b/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj index d6bfa6cf525d..ed716d4e9e21 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj @@ -21,6 +21,7 @@ + @@ -40,6 +41,9 @@ IL\HelperExtensions.cs + + IL\Stubs\TypeSystemThrowingILEmitter.cs + @@ -59,6 +63,7 @@ + diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/RyuJitCompilation.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/RyuJitCompilation.cs index 21754cf4f14e..90ad7a21d32b 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/RyuJitCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/RyuJitCompilation.cs @@ -16,7 +16,7 @@ namespace ILCompiler { - public sealed class RyuJitCompilation : Compilation + public class RyuJitCompilation : Compilation { private readonly ConditionalWeakTable _corinfos = new ConditionalWeakTable(); internal readonly RyuJitCompilationOptions _compilationOptions; @@ -26,7 +26,7 @@ public sealed class RyuJitCompilation : Compilation public InstructionSetSupport InstructionSetSupport { get; } - internal RyuJitCompilation( + public RyuJitCompilation( DependencyAnalyzerBase dependencyGraph, NodeFactory nodeFactory, IEnumerable roots, diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs index cdc7b2756257..ba6e32391940 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs @@ -22,7 +22,7 @@ namespace Internal.JitInterface { - unsafe partial class CorInfoImpl + unsafe public partial class CorInfoImpl { private struct SequencePoint { diff --git a/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj b/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj index f110ea5d4c88..cc75a03b8343 100644 --- a/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj +++ b/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj @@ -9,7 +9,7 @@ AnyCPU false true - $(RuntimeBinDir)ilc + 
E:\GitHub\runtimelab\artifacts\obj\coreclr\windows.x64.Debug\jit\Debug\ true false linux-x64;win-x64;osx-x64 @@ -59,8 +59,7 @@ false - + PreserveNewest false false @@ -135,10 +134,6 @@ On Linux renaming the library makes it difficult to debug it. --> - + diff --git a/src/coreclr/tools/aot/ilc.sln b/src/coreclr/tools/aot/ilc.sln index 3c1d83fdf890..ffb3f67490ab 100644 --- a/src/coreclr/tools/aot/ilc.sln +++ b/src/coreclr/tools/aot/ilc.sln @@ -18,6 +18,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILCompiler.LLVM", "ILCompil EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "repro", "ILCompiler\repro\repro.csproj", "{CBDE0470-E0C9-4693-9A11-ACC117522F3F}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_unix_x64_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_unix_x64_x64.vcxproj", "{6B3F8F27-C386-3F07-A886-4EF8A084593D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Checked|Any CPU = Checked|Any CPU @@ -29,6 +31,9 @@ Global Release|Any CPU = Release|Any CPU Release|x64 = Release|x64 Release|x86 = Release|x86 + RelWithDebInfo|Any CPU = RelWithDebInfo|Any CPU + RelWithDebInfo|x64 = RelWithDebInfo|x64 + RelWithDebInfo|x86 = RelWithDebInfo|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.Checked|Any CPU.ActiveCfg = Checked|x86 @@ -46,6 +51,12 @@ Global {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.Release|x64.Build.0 = Release|x64 {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.Release|x86.ActiveCfg = Release|x86 {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.Release|x86.Build.0 = Release|x86 + {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.RelWithDebInfo|x64.Build.0 = Release|x64 + {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {6856F5F6-E568-493F-AF8A-7F624B5A02A5}.RelWithDebInfo|x86.Build.0 = Release|x86 {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.Checked|Any CPU.ActiveCfg = Checked|x86 {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.Checked|x64.ActiveCfg = Checked|x64 {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.Checked|x64.Build.0 = Checked|x64 @@ -61,6 +72,12 @@ Global {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.Release|x64.Build.0 = Release|x64 {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.Release|x86.ActiveCfg = Release|x86 {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.Release|x86.Build.0 = Release|x86 + {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.RelWithDebInfo|x64.Build.0 = Release|x64 + {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {FB2D45F2-FA4C-42B2-8E53-3E1F30CF8046}.RelWithDebInfo|x86.Build.0 = Release|x86 {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.Checked|Any CPU.ActiveCfg = Checked|x86 {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.Checked|x64.ActiveCfg = Checked|x64 {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.Checked|x64.Build.0 = Checked|x64 @@ -76,6 +93,12 @@ Global {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.Release|x64.Build.0 = Release|x64 {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.Release|x86.ActiveCfg = Release|x86 
{B68D5B9E-405B-4E44-B2CC-418EC78AE070}.Release|x86.Build.0 = Release|x86 + {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.RelWithDebInfo|x64.Build.0 = Release|x64 + {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {B68D5B9E-405B-4E44-B2CC-418EC78AE070}.RelWithDebInfo|x86.Build.0 = Release|x86 {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.Checked|Any CPU.ActiveCfg = Checked|x86 {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.Checked|x64.ActiveCfg = Checked|x64 {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.Checked|x64.Build.0 = Checked|x64 @@ -91,6 +114,12 @@ Global {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.Release|x64.Build.0 = Release|x64 {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.Release|x86.ActiveCfg = Release|x86 {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.Release|x86.Build.0 = Release|x86 + {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.RelWithDebInfo|x64.Build.0 = Release|x64 + {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {05E020F4-6FA1-4DEE-8B9D-4F4B79840231}.RelWithDebInfo|x86.Build.0 = Release|x86 {07221944-6A68-4B82-8461-82C7754B1B1F}.Checked|Any CPU.ActiveCfg = Checked|x86 {07221944-6A68-4B82-8461-82C7754B1B1F}.Checked|x64.ActiveCfg = Checked|x64 {07221944-6A68-4B82-8461-82C7754B1B1F}.Checked|x64.Build.0 = Checked|x64 @@ -106,6 +135,12 @@ Global {07221944-6A68-4B82-8461-82C7754B1B1F}.Release|x64.Build.0 = Release|x64 {07221944-6A68-4B82-8461-82C7754B1B1F}.Release|x86.ActiveCfg = Release|x86 {07221944-6A68-4B82-8461-82C7754B1B1F}.Release|x86.Build.0 = Release|x86 + {07221944-6A68-4B82-8461-82C7754B1B1F}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {07221944-6A68-4B82-8461-82C7754B1B1F}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {07221944-6A68-4B82-8461-82C7754B1B1F}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {07221944-6A68-4B82-8461-82C7754B1B1F}.RelWithDebInfo|x64.Build.0 = Release|x64 + {07221944-6A68-4B82-8461-82C7754B1B1F}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {07221944-6A68-4B82-8461-82C7754B1B1F}.RelWithDebInfo|x86.Build.0 = Release|x86 {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.Checked|Any CPU.ActiveCfg = Checked|x86 {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.Checked|x64.ActiveCfg = Checked|x64 {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.Checked|x64.Build.0 = Checked|x64 @@ -121,6 +156,12 @@ Global {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.Release|x64.Build.0 = Release|x64 {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.Release|x86.ActiveCfg = Release|x86 {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.Release|x86.Build.0 = Release|x86 + {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.RelWithDebInfo|x64.Build.0 = Release|x64 + {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {FFBD9619-DE6F-4A98-8732-8A14EC3C1A18}.RelWithDebInfo|x86.Build.0 = Release|x86 
{8487612B-4DB2-4EA5-BBB3-2303659809A9}.Checked|Any CPU.ActiveCfg = Checked|x86 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Checked|x64.ActiveCfg = Checked|x64 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Checked|x64.Build.0 = Checked|x64 @@ -130,12 +171,18 @@ Global {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Debug|x64.ActiveCfg = Debug|x64 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Debug|x64.Build.0 = Debug|x64 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Debug|x86.ActiveCfg = Debug|x86 - {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Debug|x86.Build.0 = Debug|x64 + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Debug|x86.Build.0 = Debug|x86 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Release|Any CPU.ActiveCfg = Release|Any CPU {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Release|x64.ActiveCfg = Release|x64 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Release|x64.Build.0 = Release|x64 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Release|x86.ActiveCfg = Release|x86 {8487612B-4DB2-4EA5-BBB3-2303659809A9}.Release|x86.Build.0 = Release|x86 + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.RelWithDebInfo|Any CPU.ActiveCfg = Release|Any CPU + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.RelWithDebInfo|Any CPU.Build.0 = Release|Any CPU + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.RelWithDebInfo|x64.Build.0 = Release|x64 + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {8487612B-4DB2-4EA5-BBB3-2303659809A9}.RelWithDebInfo|x86.Build.0 = Release|x86 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.Checked|Any CPU.ActiveCfg = Checked|x86 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.Checked|x64.ActiveCfg = Checked|x64 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.Checked|x64.Build.0 = Checked|x64 @@ -151,6 +198,28 @@ Global {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.Release|x64.Build.0 = Release|x64 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.Release|x86.ActiveCfg = Release|x86 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.Release|x86.Build.0 = Release|x86 + {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|Any CPU.ActiveCfg = Release|x64 + {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|Any CPU.Build.0 = Release|x64 + {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x64.ActiveCfg = Release|x64 + {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x64.Build.0 = Release|x64 + {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x86.ActiveCfg = Release|x86 + {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x86.Build.0 = Release|x86 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|Any CPU.ActiveCfg = Checked|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|x64.ActiveCfg = Checked|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|x64.Build.0 = Checked|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|x86.ActiveCfg = Checked|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|Any CPU.ActiveCfg = Debug|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|x64.ActiveCfg = Debug|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|x64.Build.0 = Debug|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|x86.ActiveCfg = Debug|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|Any CPU.ActiveCfg = Release|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|x64.ActiveCfg = Release|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|x64.Build.0 = Release|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|x86.ActiveCfg = Release|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 + 
{6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE From fdea3812eca9bbd9f818e3029407d8827c7319c8 Mon Sep 17 00:00:00 2001 From: yowl Date: Thu, 4 Feb 2021 20:42:32 -0500 Subject: [PATCH 02/44] attempt with LLVMMethodCodeNode -> MethodCodeNode --- .../Compiler/DependencyAnalysis/LLVMCodegenNodeFactory.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMCodegenNodeFactory.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMCodegenNodeFactory.cs index f47f27c3ea21..dee0d467c6a5 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMCodegenNodeFactory.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMCodegenNodeFactory.cs @@ -79,7 +79,7 @@ protected override IMethodNode CreateUnboxingStubNode(MethodDesc method) protected override ISymbolNode CreateReadyToRunHelperNode(ReadyToRunHelperKey helperCall) { - throw new NotSupportedException(); + return new ReadyToRunHelperNode(helperCall.HelperId, helperCall.Target); } protected override ISymbolNode CreateGenericLookupFromDictionaryNode(ReadyToRunGenericHelperKey helperKey) From 08a3e45afaf941a242b0cc5161438fbba252c4ae Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 5 Feb 2021 07:52:17 -0500 Subject: [PATCH 03/44] Fails at {[S.P.CoreLib]System.Runtime.CompilerServices.ClassConstructorRunner} > clrjit_unix_x64_x64.dll!Compiler::getPrimitiveTypeForStruct(unsigned int structSize, CORINFO_CLASS_STRUCT_ * clsHnd, bool isVarArg) Line 723 C++ with Run-Time Check Failure #2 - Stack around the variable 'gcPtr' was corrupted. 
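One plausible reading of this failure, given that PATCH 01 points the wasm32 target at the x64 jit ("x64" target spec, ImageFileMachine.AMD64): the jit sizes the gcPtr buffer it hands to getClassGClayout from its own 8-byte pointer size, while the managed type system lays the struct out in wasm32's 4-byte slots, so the runtime side can report more GC-layout bytes than the buffer holds. That reading is a guess, not something this patch verifies. Purely as an illustration of the arithmetic (hypothetical example, not part of the compiler):

    // Hypothetical illustration of the suspected slot-count mismatch; not part of this patch.
    using System;

    class GcLayoutSlotMismatch
    {
        // getClassGClayout produces one flag byte per pointer-sized slot of the struct.
        static int GcLayoutBytes(int structSize, int pointerSize)
            => (structSize + pointerSize - 1) / pointerSize;

        static void Main()
        {
            const int structSize = 8;                              // e.g. a struct holding two object refs on wasm32
            int jitBufferBytes = GcLayoutBytes(structSize, 8);     // x64-built clrjit's view: 1 byte
            int runtimeWritten = GcLayoutBytes(structSize, 4);     // wasm32 type system's view: 2 bytes
            Console.WriteLine($"jit reserves {jitBufferBytes} byte(s), runtime writes {runtimeWritten}");
        }
    }
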
--- .../tools/Common/JitInterface/CorInfoImpl.cs | 2 +- .../DependencyAnalysis/LLVMMethodCodeNode.cs | 76 ++++++++++++++++--- .../LlvmUnboxingThunkNode.cs | 18 ++--- .../Compiler/LLVMCodegenCompilation.cs | 10 +-- .../DependencyAnalysis/MethodCodeNode.cs | 15 +++- .../JitInterface/CorInfoImpl.RyuJit.cs | 4 +- src/coreclr/tools/aot/ilc.sln | 34 ++++----- 7 files changed, 112 insertions(+), 47 deletions(-) diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index 3bf78005e590..48b42c899094 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -3268,7 +3268,7 @@ private static RelocType GetRelocType(TargetArchitecture targetArchitecture, ush default: Debug.Fail("Invalid RelocType: " + fRelocType); return 0; - }; + } } private void recordRelocation(void* location, void* target, ushort fRelocType, ushort slotNum, int addlDelta) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs index cc287598997c..d977ffe6ef52 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs @@ -10,12 +10,12 @@ namespace ILCompiler.DependencyAnalysis { - internal abstract class LLVMMethodCodeNode : MethodCodeNode// DependencyNodeCore + internal abstract class LLVMMethodCodeNode : DependencyNodeCore, IMethodCodeNode { protected readonly MethodDesc _method; protected DependencyList _dependencies; - protected LLVMMethodCodeNode(MethodDesc method) : base(method) + protected LLVMMethodCodeNode(MethodDesc method) { Debug.Assert(!method.IsAbstract); _method = method; @@ -27,10 +27,24 @@ public void SetDependencies(DependencyList dependencies) _dependencies = dependencies; } + public MethodDesc Method + { + get + { + return _method; + } + } + public override bool StaticDependenciesAreComputed => CompilationCompleted; public bool CompilationCompleted { get; set; } + public void AppendMangledName(NameMangler nameMangler, Utf8StringBuilder sb) + { + sb.Append(nameMangler.GetMangledMethodName(_method)); + } + public int Offset => 0; + public bool RepresentsIndirectionCell => false; public override bool InterestingForDynamicDependencyAnalysis => false; public override bool HasDynamicDependencies => false; @@ -38,6 +52,44 @@ public void SetDependencies(DependencyList dependencies) public override IEnumerable GetConditionalStaticDependencies(NodeFactory factory) => null; public override IEnumerable SearchDynamicDependencies(List> markedNodes, int firstNode, NodeFactory factory) => null; + public int ClassCode { get; } + + public int CompareToImpl(ISortableNode other, CompilerComparer comparer) + { + return comparer.Compare(_method, ((LLVMMethodCodeNode)other)._method); + } + + public void SetCode(ObjectNode.ObjectData data, bool isFoldable) + { + } + + public void InitializeFrameInfos(FrameInfo[] frameInfos) + { + } + + public void InitializeDebugEHClauseInfos(DebugEHClauseInfo[] debugEhClauseInfos) + { + } + + public void InitializeGCInfo(byte[] gcInfo) + { + } + + public void InitializeEHInfo(ObjectNode.ObjectData ehInfo) + { + } + + public void InitializeDebugLocInfos(DebugLocInfo[] debugLocInfos) + { + } + + public void InitializeDebugVarInfos(DebugVarInfo[] debugVarInfos) + { + } + + public void 
InitializeNonRelocationDependencies(DependencyList additionalDependencies) + { + } } internal class LlvmMethodBodyNode : LLVMMethodCodeNode, IMethodBodyNode @@ -49,16 +101,16 @@ public LlvmMethodBodyNode(MethodDesc method) protected override string GetName(NodeFactory factory) => this.GetMangledName(factory.NameMangler); - // public override IEnumerable GetStaticDependencies(NodeFactory factory) - // { - // var dependencies = new DependencyList(); - // - // foreach (DependencyListEntry node in _dependencies) - // dependencies.Add(node); - // - // return dependencies; - // } - // + public override IEnumerable GetStaticDependencies(NodeFactory factory) + { + var dependencies = new DependencyList(); + + foreach (DependencyListEntry node in _dependencies) + dependencies.Add(node); + + return dependencies; + } + int ISortableNode.ClassCode => -1502960727; int ISortableNode.CompareToImpl(ISortableNode other, CompilerComparer comparer) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs index c08c936f4086..46c6002181b8 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LlvmUnboxingThunkNode.cs @@ -15,15 +15,15 @@ public LlvmUnboxingThunkNode(MethodDesc method) protected override string GetName(NodeFactory factory) => this.GetMangledName(factory.NameMangler); - // public override IEnumerable GetStaticDependencies(NodeFactory factory) - // { - // var dependencies = new DependencyList(); - // - // foreach (DependencyListEntry node in _dependencies) - // dependencies.Add(node); - // - // return dependencies; - // } + public override IEnumerable GetStaticDependencies(NodeFactory factory) + { + var dependencies = new DependencyList(); + + foreach (DependencyListEntry node in _dependencies) + dependencies.Add(node); + + return dependencies; + } int ISortableNode.ClassCode => -18942467; diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index 83e51ca5a439..eacf12cf8829 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -67,7 +67,7 @@ protected override void CompileInternal(string outputFile, ObjectDumper dumper) protected override void ComputeDependencyNodeDependencies(List> obj) { // Determine the list of method we actually need to compile - var methodsToCompile = new List(); + var methodsToCompile = new List(); foreach (var dependency in obj) { var methodCodeNodeNeedingCode = dependency as LLVMMethodCodeNode; @@ -90,11 +90,11 @@ protected override void ComputeDependencyNodeDependencies(List methodsToCompile) + private void CompileSingleThreaded(List methodsToCompile) { CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); - foreach (MethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) + foreach (LLVMMethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) { if (Logger.IsVerbose) { @@ -105,7 +105,7 @@ private void CompileSingleThreaded(List methodsToCompile) } } - private void CompileSingleMethod(CorInfoImpl corInfo, MethodCodeNode methodCodeNodeNeedingCode) + private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodCodeNodeNeedingCode) { 
MethodDesc method = methodCodeNodeNeedingCode.Method; @@ -115,7 +115,7 @@ private void CompileSingleMethod(CorInfoImpl corInfo, MethodCodeNode methodCodeN } catch (CodeGenerationFailedException) { - ILImporter.CompileMethod(this, (LLVMMethodCodeNode)methodCodeNodeNeedingCode); + ILImporter.CompileMethod(this, methodCodeNodeNeedingCode); } catch (TypeSystemException ex) { diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs index eb7ea23ef761..d36aa70f45e0 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs @@ -3,13 +3,26 @@ using System.Diagnostics; using System.Text; +using ILCompiler.DependencyAnalysisFramework; using Internal.Text; using Internal.TypeSystem; namespace ILCompiler.DependencyAnalysis { + public interface IMethodCodeNode : IMethodNode, ISymbolDefinitionNode + { + void SetCode(ObjectNode.ObjectData data, bool isFoldable); + void InitializeFrameInfos(FrameInfo[] frameInfos); + void InitializeDebugEHClauseInfos(DebugEHClauseInfo[] debugEhClauseInfos); + void InitializeGCInfo(byte[] gcInfo); + void InitializeEHInfo(ObjectNode.ObjectData ehInfo); + void InitializeDebugLocInfos(DebugLocInfo[] debugLocInfos); + void InitializeDebugVarInfos(DebugVarInfo[] debugVarInfos); + void InitializeNonRelocationDependencies(DependencyNodeCore.DependencyList additionalDependencies); + } + [DebuggerTypeProxy(typeof(MethodCodeNodeDebugView))] - public class MethodCodeNode : ObjectNode, IMethodBodyNode, INodeWithCodeInfo, INodeWithDebugInfo, ISymbolDefinitionNode, ISpecialUnboxThunkNode + public class MethodCodeNode : ObjectNode, IMethodBodyNode, INodeWithCodeInfo, INodeWithDebugInfo, ISymbolDefinitionNode, ISpecialUnboxThunkNode, IMethodCodeNode { private MethodDesc _method; private ObjectData _methodCode; diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs index ba6e32391940..0e68a3a09917 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs @@ -36,7 +36,7 @@ private struct SequencePoint private int SizeOfReversePInvokeTransitionFrame => 2 * PointerSize; private RyuJitCompilation _compilation; - private MethodCodeNode _methodCodeNode; + private IMethodCodeNode _methodCodeNode; private DebugLocInfo[] _debugLocInfos; private DebugVarInfo[] _debugVarInfos; private Dictionary _sequencePoints; @@ -58,7 +58,7 @@ private MethodDesc getUnboxingThunk(MethodDesc method) return _unboxingThunkFactory.GetUnboxingMethod(method); } - public void CompileMethod(MethodCodeNode methodCodeNodeNeedingCode, MethodIL methodIL = null) + public void CompileMethod(IMethodCodeNode methodCodeNodeNeedingCode, MethodIL methodIL = null) { _methodCodeNode = methodCodeNodeNeedingCode; _isFallbackBodyCompilation = methodIL != null; diff --git a/src/coreclr/tools/aot/ilc.sln b/src/coreclr/tools/aot/ilc.sln index ffb3f67490ab..6b8af92a4d87 100644 --- a/src/coreclr/tools/aot/ilc.sln +++ b/src/coreclr/tools/aot/ilc.sln @@ -18,7 +18,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILCompiler.LLVM", "ILCompil EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "repro", "ILCompiler\repro\repro.csproj", 
"{CBDE0470-E0C9-4693-9A11-ACC117522F3F}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_unix_x64_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_unix_x64_x64.vcxproj", "{6B3F8F27-C386-3F07-A886-4EF8A084593D}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x86_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x86_x64.vcxproj", "{AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -204,22 +204,22 @@ Global {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x64.Build.0 = Release|x64 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x86.ActiveCfg = Release|x86 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x86.Build.0 = Release|x86 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|Any CPU.ActiveCfg = Checked|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|x64.ActiveCfg = Checked|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|x64.Build.0 = Checked|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Checked|x86.ActiveCfg = Checked|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|Any CPU.ActiveCfg = Debug|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|x64.ActiveCfg = Debug|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|x64.Build.0 = Debug|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Debug|x86.ActiveCfg = Debug|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|Any CPU.ActiveCfg = Release|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|x64.ActiveCfg = Release|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|x64.Build.0 = Release|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.Release|x86.ActiveCfg = Release|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 - {6B3F8F27-C386-3F07-A886-4EF8A084593D}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|Any CPU.ActiveCfg = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.ActiveCfg = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.Build.0 = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x86.ActiveCfg = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|Any CPU.ActiveCfg = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.ActiveCfg = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.Build.0 = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x86.ActiveCfg = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|Any CPU.ActiveCfg = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.ActiveCfg = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.Build.0 = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x86.ActiveCfg = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE From 70f98ab2e81875fa31b74cb6363a4b4c2275d185 Mon Sep 17 00:00:00 2001 From: yowl Date: Sat, 6 
Feb 2021 15:21:02 -0500 Subject: [PATCH 04/44] build wasm32/64 clrjits and load the wasm clrjit dll for wasm32 Wasm64 has some macros defined, but otherwise not compiled Copied mostly from the xarch and AMD64 macros to get something to compile and run. --- docs/workflow/building/coreclr/nativeaot.md | 5 + src/coreclr/inc/cordebuginfo.h | 2 + src/coreclr/inc/corinfoinstructionset.h | 37 +- src/coreclr/inc/switches.h | 2 + src/coreclr/jit/CMakeLists.txt | 55 +- src/coreclr/jit/codegen.h | 4 +- src/coreclr/jit/codegencommon.cpp | 26 +- src/coreclr/jit/codegeninterface.h | 4 +- src/coreclr/jit/codegenwasm.cpp | 6236 ++++++++++++++ src/coreclr/jit/compiler.cpp | 5 +- src/coreclr/jit/compiler.h | 40 +- src/coreclr/jit/emit.cpp | 11 +- src/coreclr/jit/emit.h | 85 +- src/coreclr/jit/emitdef.h | 2 + src/coreclr/jit/emitfmts.h | 2 + src/coreclr/jit/emitfmtswasm.h | 218 + src/coreclr/jit/emitinl.h | 85 +- src/coreclr/jit/emitjmps.h | 21 + src/coreclr/jit/emitwasm.cpp | 7217 +++++++++++++++++ src/coreclr/jit/emitwasm.h | 573 ++ src/coreclr/jit/emitxarch.cpp | 2 +- src/coreclr/jit/error.h | 26 + src/coreclr/jit/gentree.cpp | 16 +- src/coreclr/jit/gentree.h | 8 +- src/coreclr/jit/gtstructs.h | 2 + src/coreclr/jit/importer.cpp | 2 +- src/coreclr/jit/instr.cpp | 37 +- src/coreclr/jit/instr.h | 21 + src/coreclr/jit/instrs.h | 2 + src/coreclr/jit/instrswasm.h | 774 ++ src/coreclr/jit/jit.h | 6 +- src/coreclr/jit/lclvars.cpp | 2 +- src/coreclr/jit/liveness.cpp | 2 +- src/coreclr/jit/lower.cpp | 8 +- src/coreclr/jit/lower.h | 4 +- src/coreclr/jit/lowerwasm.cpp | 4907 +++++++++++ src/coreclr/jit/lsra.h | 19 +- src/coreclr/jit/lsrawasm.cpp | 1528 ++++ src/coreclr/jit/morph.cpp | 5 +- src/coreclr/jit/register.h | 7 +- src/coreclr/jit/simd.h | 15 + src/coreclr/jit/target.h | 789 +- src/coreclr/jit/targetwasm.cpp | 33 + src/coreclr/jit/unwind.cpp | 2 + src/coreclr/jit/unwindwasm.cpp | 426 + src/coreclr/jit/utils.cpp | 10 +- src/coreclr/jit/valuenum.cpp | 4 + src/coreclr/jit/valuenumfuncs.h | 2 + .../Common/JitInterface/JitConfigProvider.cs | 5 +- .../TypeSystem/Common/TargetArchitecture.cs | 1 + src/coreclr/tools/aot/ilc.sln | 52 +- 51 files changed, 23271 insertions(+), 76 deletions(-) create mode 100644 src/coreclr/jit/codegenwasm.cpp create mode 100644 src/coreclr/jit/emitfmtswasm.h create mode 100644 src/coreclr/jit/emitwasm.cpp create mode 100644 src/coreclr/jit/emitwasm.h create mode 100644 src/coreclr/jit/instrswasm.h create mode 100644 src/coreclr/jit/lowerwasm.cpp create mode 100644 src/coreclr/jit/lsrawasm.cpp create mode 100644 src/coreclr/jit/targetwasm.cpp create mode 100644 src/coreclr/jit/unwindwasm.cpp diff --git a/docs/workflow/building/coreclr/nativeaot.md b/docs/workflow/building/coreclr/nativeaot.md index 8a7ef8c8ff4f..240e1a1e4715 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -11,8 +11,13 @@ The Native AOT toolchain can be currently built for Linux, macOS and Windows x64 - Run `dotnet publish --packages pkg -r [win-x64|linux-x64|osx-64] -c [Debug|Release]` to publish your project. `--packages pkg` option restores the package into a local directory that is easy to cleanup once you are done. It avoids polluting the global nuget cache with your locally built dev package. ## Building for Web Assembly +- This branch contains a version of the WebAssembly compiler that creates LLVM from the clrjit to take advantage of RyuJits optimisations. It goes from RyuJIT IR -> LLVM instead of the NativeAOT-LLVM branch way of CIL -> LLVM. 
+- It does not work yet, and may never. - Currently only tested on Windows +- Build the x64 libraries and compiler as per the Building section. - Run `build nativeaot+libs+nativeaot.packages -rc [Debug|Release] -lc [Debug|Release] -a wasm -os Browser -runtimeFlavor CoreCLR` +- The compiler can now be debugged with the Wasm clrjit. Load the clrjit_browser_wasm32_x64.vcxproj which can be found in artifacts\obj\coreclr\windows.x64.Debug\jit +- Run Ilc with a .rsp file as normal for WebAssembly, e.g. if you build the WebAssembly tests you can use artifacts\tests\coreclr\Browser.wasm.Debug\nativeaot\SmokeTests\HelloWasm\HelloWasm\native\HelloWasm.ilc.rsp - Add the package directory to your `nuget.config` as above. - Run `dotnet publish -r browser-wasm -c [Debug|Release] /p:Platform=wasm` to publish. diff --git a/src/coreclr/inc/cordebuginfo.h b/src/coreclr/inc/cordebuginfo.h index 66b0d50b875b..466c972dc7b6 100644 --- a/src/coreclr/inc/cordebuginfo.h +++ b/src/coreclr/inc/cordebuginfo.h @@ -145,6 +145,8 @@ class ICorDebugInfo REGNUM_R13, REGNUM_R14, REGNUM_R15, +#elif TARGET_WASM32 || TARGET_WASM64 +// TODO???? #else PORTABILITY_WARNING("Register numbers not defined on this platform") #endif diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h index 75ab11d7aeab..f1e654ed4a27 100644 --- a/src/coreclr/inc/corinfoinstructionset.h +++ b/src/coreclr/inc/corinfoinstructionset.h @@ -108,7 +108,42 @@ enum CORINFO_InstructionSet InstructionSet_PCLMULQDQ_X64=33, InstructionSet_POPCNT_X64=34, #endif // TARGET_X86 - +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) + //InstructionSet_X86Base = 1, + //InstructionSet_SSE = 2, + //InstructionSet_SSE2 = 3, + //InstructionSet_SSE3 = 4, + //InstructionSet_SSSE3 = 5, + //InstructionSet_SSE41 = 6, + InstructionSet_SSE42 = 7, + //InstructionSet_AVX = 8, + InstructionSet_AVX2 = 9, + //InstructionSet_AES = 10, + //InstructionSet_BMI1 = 11, + //InstructionSet_BMI2 = 12, + //InstructionSet_FMA = 13, + //InstructionSet_LZCNT = 14, + //InstructionSet_PCLMULQDQ = 15, + //InstructionSet_POPCNT = 16, + //InstructionSet_Vector128 = 17, + //InstructionSet_Vector256 = 18, + //InstructionSet_X86Base_X64 = 19, + //InstructionSet_SSE_X64 = 20, + //InstructionSet_SSE2_X64 = 21, + //InstructionSet_SSE3_X64 = 22, + //InstructionSet_SSSE3_X64 = 23, + //InstructionSet_SSE41_X64 = 24, + //InstructionSet_SSE42_X64 = 25, + //InstructionSet_AVX_X64 = 26, + //InstructionSet_AVX2_X64 = 27, + //InstructionSet_AES_X64 = 28, + //InstructionSet_BMI1_X64 = 29, + //InstructionSet_BMI2_X64 = 30, + //InstructionSet_FMA_X64 = 31, + //InstructionSet_LZCNT_X64 = 32, + //InstructionSet_PCLMULQDQ_X64 = 33, + //InstructionSet_POPCNT_X64 = 34, +#endif // TARGET_WASM32 || TARGET_WASM64 }; struct CORINFO_InstructionSetFlags diff --git a/src/coreclr/inc/switches.h b/src/coreclr/inc/switches.h index 8fb65335116b..7a699abfd3c6 100644 --- a/src/coreclr/inc/switches.h +++ b/src/coreclr/inc/switches.h @@ -65,6 +65,8 @@ #define USE_UPPER_ADDRESS 0 #endif // !HOST_UNIX +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + #define USE_UPPER_ADDRESS 0 // TODO : what's this?
#else #error Please add a new #elif clause and define all portability macros for the new platform #endif diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 8cc12dda7ae1..0cca94100c90 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -23,7 +23,11 @@ function(create_standalone_jit) endif() set(JIT_ARCH_LINK_LIBRARIES gcinfo_unix_arm64) else() - set(JIT_ARCH_LINK_LIBRARIES gcinfo_${TARGETDETAILS_OS}_${TARGETDETAILS_ARCH}) + if(TARGETDETAILS_OS STREQUAL "browser") + set(JIT_ARCH_LINK_LIBRARIES gcinfo_win_x64) # TODO: Wasm + else() + set(JIT_ARCH_LINK_LIBRARIES gcinfo_${TARGETDETAILS_OS}_${TARGETDETAILS_ARCH}) + endif() endif() if(TARGETDETAILS_ARCH STREQUAL "x64") @@ -34,6 +38,10 @@ function(create_standalone_jit) set(JIT_ARCH_SOURCES ${JIT_I386_SOURCES}) elseif(TARGETDETAILS_ARCH STREQUAL "arm64") set(JIT_ARCH_SOURCES ${JIT_ARM64_SOURCES}) + elseif(TARGETDETAILS_ARCH STREQUAL "wasm64") + set(JIT_ARCH_SOURCES ${JIT_WASM64_SOURCES}) + elseif(TARGETDETAILS_ARCH STREQUAL "wasm32") + set(JIT_ARCH_SOURCES ${JIT_WASM32_SOURCES}) else() clr_unknown_arch() endif() @@ -57,8 +65,19 @@ function(create_standalone_jit) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_SIMD) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_HW_INTRINSICS) endif () + + if (TARGETDETAILS_ARCH STREQUAL "wasm64") + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM64) + endif () + if (TARGETDETAILS_ARCH STREQUAL "wasm32") + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM32) + endif () endfunction() +if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") + set(CLR_CMAKE_TARGET_ARCH_WASM 1) +endif () + if (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR (CLR_CMAKE_TARGET_ARCH_I386 AND NOT CLR_CMAKE_HOST_UNIX)) add_compile_definitions($<$>>:FEATURE_SIMD>) add_compile_definitions($<$>>:FEATURE_HW_INTRINSICS>) @@ -270,6 +289,7 @@ if (CLR_CMAKE_TARGET_WIN32) list (APPEND JIT_HEADERS emitfmtsxarch.h emitxarch.h + emitwasm.h hwintrinsiclistxarch.h hwintrinsic.h instrsxarch.h @@ -293,6 +313,34 @@ set( JIT_AMD64_SOURCES hwintrinsiccodegenxarch.cpp ) +# TODO this is just a copy of AMD64_SOURCES to get started - e.g. 
simd,hwintrinsics doesn't make sense for wasm +set( JIT_WASM64_SOURCES + codegenwasm.cpp + emitwasm.cpp + lowerwasm.cpp + lsrawasm.cpp + simd.cpp + simdashwintrinsic.cpp + simdcodegenxarch.cpp + targetwasm.cpp + unwindwasm.cpp + hwintrinsicxarch.cpp + hwintrinsiccodegenxarch.cpp +) +set( JIT_WASM32_SOURCES + codegenwasm.cpp + emitwasm.cpp + lowerwasm.cpp + lsrawasm.cpp + simd.cpp + simdashwintrinsic.cpp + simdcodegenxarch.cpp + targetwasm.cpp + unwindwasm.cpp + hwintrinsicxarch.cpp + hwintrinsiccodegenxarch.cpp +) + set( JIT_ARM_SOURCES codegenarmarch.cpp codegenarm.cpp @@ -363,6 +411,8 @@ convert_to_absolute_path(JIT_AMD64_SOURCES ${JIT_AMD64_SOURCES}) convert_to_absolute_path(JIT_ARM_SOURCES ${JIT_ARM_SOURCES}) convert_to_absolute_path(JIT_I386_SOURCES ${JIT_I386_SOURCES}) convert_to_absolute_path(JIT_ARM64_SOURCES ${JIT_ARM64_SOURCES}) +convert_to_absolute_path(JIT_WASM64_SOURCES ${JIT_WASM64_SOURCES}) +convert_to_absolute_path(JIT_WASM32_SOURCES ${JIT_WASM32_SOURCES}) set(JIT_DLL_MAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/dllmain.cpp) @@ -499,6 +549,9 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) + # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll + #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 9deffd0883a5..cf9551454d60 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -50,7 +50,7 @@ class CodeGen final : public CodeGenInterface ssize_t* cnsPtr); private: -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // Bit masks used in negating a float or double number. // This is to avoid creating more than one data constant for these bitmasks when a // method has more than one GT_NEG operation on floating point values. @@ -422,7 +422,7 @@ class CodeGen final : public CodeGenInterface #endif // TARGET_AMD64 -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // Save/Restore callee saved float regs to stack void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize); diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 3933ace5f345..3e3f56a1f19d 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -820,7 +820,7 @@ TempDsc* CodeGenInterface::getSpillTempDsc(GenTree* tree) return temp; } -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) #ifdef TARGET_AMD64 // Returns relocation type hint for an addr. @@ -1926,7 +1926,7 @@ void CodeGen::genCheckOverflow(GenTree* tree) { bool isUnsignedOverflow = ((tree->gtFlags & GTF_UNSIGNED) != 0); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm jumpKind = isUnsignedOverflow ? 
EJ_jb : EJ_jo; @@ -5663,6 +5663,10 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& #elif defined(TARGET_ARM64) // We will just zero out the entire vector register. This sets it to a double/float zero value GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, reg, 0x00, INS_OPTS_16B); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + // XORPS is the fastest and smallest way to initialize a XMM register to zero. + inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE); + dblInitReg = reg; #else // TARGET* #error Unsupported or unset target architecture #endif @@ -5698,6 +5702,10 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& #elif defined(TARGET_ARM64) // We will just zero out the entire vector register. This sets it to a double/float zero value GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, reg, 0x00, INS_OPTS_16B); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + // XORPS is the fastest and smallest way to initialize a XMM register to zero. + inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE); + fltInitReg = reg; #else // TARGET* #error Unsupported or unset target architecture #endif @@ -6007,7 +6015,7 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) } } -#elif defined(TARGET_XARCH) +#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog) { @@ -6328,7 +6336,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, #endif // TARGET_ARM64 noway_assert(uCntBytes == 0); -#elif defined(TARGET_XARCH) +#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported); emitter* emit = GetEmitter(); regNumber frameReg = genFramePointerReg(); @@ -6340,7 +6348,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, noway_assert((blkSize % sizeof(int)) == 0); // initReg is not a live incoming argument reg assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); -#if defined(TARGET_AMD64) +#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // We will align on x64 so can use the aligned mov instruction simdMov = simdAlignedMovIns(); // Aligning low we want to move up to next boundary @@ -6366,7 +6374,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, { emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i); } -#if defined(TARGET_AMD64) +#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm assert((i == blkSize) || (i + (int)sizeof(int) == blkSize)); if (i != blkSize) { @@ -6388,7 +6396,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM4); #endif // UNIX_AMD64_ABI -#if defined(TARGET_AMD64) +#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) int alignedLclHi; int alignmentHiBlkSize; @@ -8497,7 +8505,7 @@ void CodeGen::genFnEpilog(BasicBlock* block) compiler->unwindEndEpilog(); } -#elif defined(TARGET_XARCH) +#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm void CodeGen::genFnEpilog(BasicBlock* block) { @@ -9872,7 +9880,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 
*/ -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE] // Here offset = 16-byte aligned offset after pushing integer registers. diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index 9a0c5bfba785..e5029fbfa4c5 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -116,6 +116,8 @@ class CodeGenInterface static const insFlags instInfo[INS_count]; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) static const BYTE instInfo[INS_count]; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + static const insFlags instInfo[INS_count]; #else #error Unsupported target architecture #endif @@ -200,7 +202,7 @@ class CodeGenInterface regNumber genGetThisArgReg(GenTreeCall* call) const; -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm #ifdef TARGET_AMD64 // There are no reloc hints on x86 unsigned short genAddrRelocTypeHint(size_t addr); diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp new file mode 100644 index 000000000000..631277f36f94 --- /dev/null +++ b/src/coreclr/jit/codegenwasm.cpp @@ -0,0 +1,6236 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX Amd64/x86 Code Generator XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#pragma warning(disable : 4310) // cast truncates constant value - happens for (int8_t)0xb1 +#endif + +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#include "emit.h" +#include "codegen.h" +#include "lower.h" +#include "gcinfo.h" +#include "gcinfoencoder.h" +#include "patchpointinfo.h" + +/***************************************************************************** + * + * Generate code that will set the given register to the integer constant. + */ + +void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags) +{ + // Reg cannot be a FP reg + assert(!genIsValidFloatReg(reg)); + + // The only TYP_REF constant that can come this path is a managed 'null' since it is not + // relocatable. Other ref type constants (e.g. string objects) go through a different + // code path. + noway_assert(type != TYP_REF || val == 0); + + if (val == 0) + { + instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags); + } + else + { + // TODO-XArch-CQ: needs all the optimized cases + GetEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val); + } +} + +//--------------------------------------------------------------------- +// genSetGSSecurityCookie: Set the "GS" security cookie in the prolog. +// +// Arguments: +// initReg - register to use as a scratch register +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. 
+// +// Return Value: +// None +// +void CodeGen::genSetGSSecurityCookie(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + if (!compiler->getNeedsGSSecurityCookie()) + { + return; + } + + if (compiler->opts.IsOSR() && compiler->info.compPatchpointInfo->HasSecurityCookie()) + { + // Security cookie is on original frame and was initialized there. + return; + } + + if (compiler->gsGlobalSecurityCookieAddr == nullptr) + { + noway_assert(compiler->gsGlobalSecurityCookieVal != 0); +#ifdef TARGET_AMD64 + if ((int)compiler->gsGlobalSecurityCookieVal != compiler->gsGlobalSecurityCookieVal) + { + // initReg = #GlobalSecurityCookieVal64; [frame.GSSecurityCookie] = initReg + genSetRegToIcon(initReg, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL); + GetEmitter()->emitIns_S_R(INS_mov, EA_PTRSIZE, initReg, compiler->lvaGSSecurityCookie, 0); + *pInitRegZeroed = false; + } + else +#endif + { + // mov dword ptr [frame.GSSecurityCookie], #GlobalSecurityCookieVal + GetEmitter()->emitIns_S_I(INS_mov, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0, + (int)compiler->gsGlobalSecurityCookieVal); + } + } + else + { + // Always use EAX on x86 and x64 + // On x64, if we're not moving into RAX, and the address isn't RIP relative, we can't encode it. + // mov eax, dword ptr [compiler->gsGlobalSecurityCookieAddr] + // mov dword ptr [frame.GSSecurityCookie], eax + GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_EAX, (ssize_t)compiler->gsGlobalSecurityCookieAddr); + regSet.verifyRegUsed(REG_EAX); + GetEmitter()->emitIns_S_R(INS_mov, EA_PTRSIZE, REG_EAX, compiler->lvaGSSecurityCookie, 0); + if (initReg == REG_EAX) + { + *pInitRegZeroed = false; + } + } +} + +/***************************************************************************** + * + * Generate code to check that the GS cookie wasn't thrashed by a buffer + * overrun. If pushReg is true, preserve all registers around code sequence. + * Otherwise ECX could be modified. + * + * Implementation Note: pushReg = true, in case of tail calls. + */ +void CodeGen::genEmitGSCookieCheck(bool pushReg) +{ + noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal); + + // Make sure that EAX is reported as live GC-ref so that any GC that kicks in while + // executing GS cookie check will not collect the object pointed to by EAX. + // + // For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX + // In such case make sure that the correct GC-ness of RDX is reported as well, so + // a GC object pointed by RDX will not be collected. + if (!pushReg) + { + // Handle multi-reg return type values + if (compiler->compMethodReturnsMultiRegRetType()) + { + ReturnTypeDesc retTypeDesc; + if (varTypeIsLong(compiler->info.compRetNativeType)) + { + retTypeDesc.InitializeLongReturnType(); + } + else // we must have a struct return type + { + retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass, + compiler->info.compCallConv); + } + + const unsigned regCount = retTypeDesc.GetReturnRegCount(); + + // Only x86 and x64 Unix ABI allows multi-reg return and + // number of result regs should be equal to MAX_RET_REG_COUNT. + assert(regCount == MAX_RET_REG_COUNT); + + for (unsigned i = 0; i < regCount; ++i) + { + gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i)); + } + } + else if (compiler->compMethodReturnsRetBufAddr()) + { + // This is for returning in an implicit RetBuf. 
+ // If the address of the buffer is returned in REG_INTRET, mark the content of INTRET as ByRef. + + // In case the return is in an implicit RetBuf, the native return type should be a struct + assert(varTypeIsStruct(compiler->info.compRetNativeType)); + + gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF); + } + // ... all other cases. + else + { +#ifdef TARGET_AMD64 + // For x64, structs that are not returned in registers are always + // returned in implicit RetBuf. If we reached here, we should not have + // a RetBuf and the return type should not be a struct. + assert(compiler->info.compRetBuffArg == BAD_VAR_NUM); + assert(!varTypeIsStruct(compiler->info.compRetNativeType)); +#endif // TARGET_AMD64 + + // For x86 Windows we can't make such assertions since we generate code for returning of + // the RetBuf in REG_INTRET only when the ProfilerHook is enabled. Otherwise + // compRetNativeType could be TYP_STRUCT. + gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType); + } + } + + regNumber regGSCheck; + regMaskTP regMaskGSCheck = RBM_NONE; + + if (!pushReg) + { + // Non-tail call: we can use any callee trash register that is not + // a return register or contain 'this' pointer (keep alive this), since + // we are generating GS cookie check after a GT_RETURN block. + // Note: On Amd64 System V RDX is an arg register - REG_ARG_2 - as well + // as return register for two-register-returned structs. + if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister && + (compiler->lvaTable[compiler->info.compThisArg].GetRegNum() == REG_ARG_0)) + { + regGSCheck = REG_ARG_1; + } + else + { + regGSCheck = REG_ARG_0; + } + } + else + { +#ifdef TARGET_X86 + // It doesn't matter which register we pick, since we're going to save and restore it + // around the check. + // TODO-CQ: Can we optimize the choice of register to avoid doing the push/pop sometimes? + regGSCheck = REG_EAX; + regMaskGSCheck = RBM_EAX; +#else // !TARGET_X86 + // Jmp calls: specify method handle using which JIT queries VM for its entry point + // address and hence it can neither be a VSD call nor PInvoke calli with cookie + // parameter. Therefore, in case of jmp calls it is safe to use R11. + regGSCheck = REG_R11; +#endif // !TARGET_X86 + } + + regMaskTP byrefPushedRegs = RBM_NONE; + regMaskTP norefPushedRegs = RBM_NONE; + regMaskTP pushedRegs = RBM_NONE; + + if (compiler->gsGlobalSecurityCookieAddr == nullptr) + { +#if defined(TARGET_AMD64) + // If GS cookie value fits within 32-bits we can use 'cmp mem64, imm32'. + // Otherwise, load the value into a reg and use 'cmp mem64, reg64'. + if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal) + { + genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL); + GetEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0); + } + else +#endif // defined(TARGET_AMD64) + { + assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal); + GetEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0, + (int)compiler->gsGlobalSecurityCookieVal); + } + } + else + { + // Ngen case - GS cookie value needs to be accessed through an indirection. 
+ + pushedRegs = genPushRegs(regMaskGSCheck, &byrefPushedRegs, &norefPushedRegs); + + instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr); + GetEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0); + GetEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0); + } + + BasicBlock* gsCheckBlk = genCreateTempLabel(); + inst_JMP(EJ_je, gsCheckBlk); + genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN); + genDefineTempLabel(gsCheckBlk); + + genPopRegs(pushedRegs, byrefPushedRegs, norefPushedRegs); +} + +BasicBlock* CodeGen::genCallFinally(BasicBlock* block) +{ +#if defined(FEATURE_EH_FUNCLETS) + // Generate a call to the finally, like this: + // mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym + // call finally-funclet + // jmp finally-return // Only for non-retless finally calls + // The jmp can be a NOP if we're going to the next block. + // If we're generating code for the main function (not a funclet), and there is no localloc, + // then RSP at this point is the same value as that stored in the PSPSym. So just copy RSP + // instead of loading the PSPSym in this case, or if PSPSym is not used (CoreRT ABI). + + if ((compiler->lvaPSPSym == BAD_VAR_NUM) || + (!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT))) + { +#ifndef UNIX_X86_ABI + inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL); +#endif // !UNIX_X86_ABI + } + else + { + GetEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0); + } + GetEmitter()->emitIns_J(INS_call, block->bbJumpDest); + + if (block->bbFlags & BBF_RETLESS_CALL) + { + // We have a retless call, and the last instruction generated was a call. + // If the next block is in a different EH region (or is the end of the code + // block), then we need to generate a breakpoint here (since it will never + // get executed) to get proper unwind behavior. + + if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext)) + { + instGen(INS_BREAKPOINT); // This should never get executed + } + } + else + { +// TODO-Linux-x86: Do we need to handle the GC information for this NOP or JMP specially, as is done for other +// architectures? +#ifndef JIT32_GCENCODER + // Because of the way the flowgraph is connected, the liveness info for this one instruction + // after the call is not (can not be) correct in cases where a variable has a last use in the + // handler. So turn off GC reporting for this single instruction. + GetEmitter()->emitDisableGC(); +#endif // JIT32_GCENCODER + + // Now go to where the finally funclet needs to return to. + if (block->bbNext->bbJumpDest == block->bbNext->bbNext) + { + // Fall-through. + // TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly + // to the next instruction? This would depend on stack walking from within the finally + // handler working without this instruction being in this special EH region. + instGen(INS_nop); + } + else + { + inst_JMP(EJ_jmp, block->bbNext->bbJumpDest); + } + +#ifndef JIT32_GCENCODER + GetEmitter()->emitEnableGC(); +#endif // JIT32_GCENCODER + } + +#else // !FEATURE_EH_FUNCLETS + + // If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot + // corresponding to the finally's nesting level. When invoked in response to an exception, the + // EE does this. + // + // We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS. 
+ // + // We will emit : + // mov [ebp - (n + 1)], 0 + // mov [ebp - n ], 0xFC + // push &step + // jmp finallyBlock + // ... + // step: + // mov [ebp - n ], 0 + // jmp leaveTarget + // ... + // leaveTarget: + + noway_assert(isFramePointerUsed()); + + // Get the nesting level which contains the finally + unsigned finallyNesting = 0; + compiler->fgGetNestingLevel(block, &finallyNesting); + + // The last slot is reserved for ICodeManager::FixContext(ppEndRegion) + unsigned filterEndOffsetSlotOffs; + filterEndOffsetSlotOffs = (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE); + + unsigned curNestingSlotOffs; + curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE)); + + // Zero out the slot for the next nesting level + GetEmitter()->emitIns_S_I(INS_mov, EA_PTRSIZE, compiler->lvaShadowSPslotsVar, + curNestingSlotOffs - TARGET_POINTER_SIZE, 0); + GetEmitter()->emitIns_S_I(INS_mov, EA_PTRSIZE, compiler->lvaShadowSPslotsVar, curNestingSlotOffs, LCL_FINALLY_MARK); + + // Now push the address where the finally funclet should return to directly. + if (!(block->bbFlags & BBF_RETLESS_CALL)) + { + assert(block->isBBCallAlwaysPair()); + GetEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest); + } + else + { + // EE expects a DWORD, so we provide 0 + inst_IV(INS_push_hide, 0); + } + + // Jump to the finally BB + inst_JMP(EJ_jmp, block->bbJumpDest); + +#endif // !FEATURE_EH_FUNCLETS + + // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the + // jump target using bbJumpDest - that is already used to point + // to the finally block. So just skip past the BBJ_ALWAYS unless the + // block is RETLESS. + if (!(block->bbFlags & BBF_RETLESS_CALL)) + { + assert(block->isBBCallAlwaysPair()); + block = block->bbNext; + } + return block; +} + +#if defined(FEATURE_EH_FUNCLETS) +void CodeGen::genEHCatchRet(BasicBlock* block) +{ + // Set RAX to the address the VM should return to after the catch. + // Generate a RIP-relative + // lea reg, [rip + disp32] ; the RIP is implicit + // which will be position-independent. + GetEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET); +} + +#else // !FEATURE_EH_FUNCLETS + +void CodeGen::genEHFinallyOrFilterRet(BasicBlock* block) +{ + // The last statement of the block must be a GT_RETFILT, which has already been generated. + assert(block->lastNode() != nullptr); + assert(block->lastNode()->OperGet() == GT_RETFILT); + + if (block->bbJumpKind == BBJ_EHFINALLYRET) + { + assert(block->lastNode()->AsOp()->gtOp1 == nullptr); // op1 == nullptr means endfinally + + // Return using a pop-jmp sequence. As the "try" block calls + // the finally with a jmp, this leaves the x86 call-ret stack + // balanced in the normal flow of path. + + noway_assert(isFramePointerRequired()); + inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL); + inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL); + } + else + { + assert(block->bbJumpKind == BBJ_EHFILTERRET); + + // The return value has already been computed. 
+ instGen_Return(0); + } +} + +#endif // !FEATURE_EH_FUNCLETS + +// Move an immediate value into an integer register + +void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, + regNumber reg, + ssize_t imm, + insFlags flags DEBUGARG(size_t targetHandle) DEBUGARG(unsigned gtFlags)) +{ + // reg cannot be a FP register + assert(!genIsValidFloatReg(reg)); + + if (!compiler->opts.compReloc) + { + size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs + } + + if ((imm == 0) && !EA_IS_RELOC(size)) + { + instGen_Set_Reg_To_Zero(size, reg, flags); + } + else + { + if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm)) + { + emitAttr newSize = EA_PTR_DSP_RELOC; + if (EA_IS_BYREF(size)) + { + newSize = EA_SET_FLG(newSize, EA_BYREF_FLG); + } + + GetEmitter()->emitIns_R_AI(INS_lea, newSize, reg, imm); + } + else + { + GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm); + } + } + regSet.verifyRegUsed(reg); +} + +/*********************************************************************************** + * + * Generate code to set a register 'targetReg' of type 'targetType' to the constant + * specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call + * genProduceReg() on the target register. + */ +void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree) +{ + switch (tree->gtOper) + { + case GT_CNS_INT: + { + // relocatable values tend to come down as a CNS_INT of native int type + // so the line between these two opcodes is kind of blurry + GenTreeIntConCommon* con = tree->AsIntConCommon(); + ssize_t cnsVal = con->IconValue(); + + if (con->ImmedValNeedsReloc(compiler)) + { + emitAttr size = EA_HANDLE_CNS_RELOC; + + if (targetType == TYP_BYREF) + { + size = EA_SET_FLG(size, EA_BYREF_FLG); + } + + instGen_Set_Reg_To_Imm(size, targetReg, cnsVal); + regSet.verifyRegUsed(targetReg); + } + else + { + genSetRegToIcon(targetReg, cnsVal, targetType); + } + } + break; + + case GT_CNS_DBL: + { + emitter* emit = GetEmitter(); + emitAttr size = emitTypeSize(targetType); + double constValue = tree->AsDblCon()->gtDconVal; + + // Make sure we use "xorps reg, reg" only for +ve zero constant (0.0) and not for -ve zero (-0.0) + if (*(__int64*)&constValue == 0) + { + // A faster/smaller way to generate 0 + emit->emitIns_R_R(INS_xorps, size, targetReg, targetReg); + } + else + { + CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size); + emit->emitIns_R_C(ins_Load(targetType), size, targetReg, hnd, 0); + } + } + break; + + default: + unreached(); + } +} + +//------------------------------------------------------------------------ +// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node. +// +// Arguments: +// tree - the node +// +void CodeGen::genCodeForNegNot(GenTree* tree) +{ + assert(tree->OperIs(GT_NEG, GT_NOT)); + + regNumber targetReg = tree->GetRegNum(); + var_types targetType = tree->TypeGet(); + + if (varTypeIsFloating(targetType)) + { + assert(tree->gtOper == GT_NEG); + genSSE2BitwiseOp(tree); + } + else + { + GenTree* operand = tree->gtGetOp1(); + assert(operand->isUsedFromReg()); + regNumber operandReg = genConsumeReg(operand); + + if (operandReg != targetReg) + { + inst_RV_RV(INS_mov, targetReg, operandReg, targetType); + } + + instruction ins = genGetInsForOper(tree->OperGet(), targetType); + inst_RV(ins, targetReg, targetType); + } + + genProduceReg(tree); +} + +//------------------------------------------------------------------------ +// genCodeForBswap: Produce code for a GT_BSWAP / GT_BSWAP16 node. 
+// +// Arguments: +// tree - the node +// +void CodeGen::genCodeForBswap(GenTree* tree) +{ + // TODO: If we're swapping immediately after a read from memory or immediately before + // a write to memory, use the MOVBE instruction instead of the BSWAP instruction if + // the platform supports it. + + assert(tree->OperIs(GT_BSWAP, GT_BSWAP16)); + + regNumber targetReg = tree->GetRegNum(); + var_types targetType = tree->TypeGet(); + + GenTree* operand = tree->gtGetOp1(); + assert(operand->isUsedFromReg()); + regNumber operandReg = genConsumeReg(operand); + + if (operandReg != targetReg) + { + inst_RV_RV(INS_mov, targetReg, operandReg, targetType); + } + + if (tree->OperIs(GT_BSWAP)) + { + // 32-bit and 64-bit byte swaps use "bswap reg" + inst_RV(INS_bswap, targetReg, targetType); + } + else + { + // 16-bit byte swaps use "ror reg.16, 8" + inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); + } + + genProduceReg(tree); +} + +// Generate code to get the high N bits of a N*N=2N bit multiplication result +void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) +{ + assert(!treeNode->gtOverflowEx()); + + regNumber targetReg = treeNode->GetRegNum(); + var_types targetType = treeNode->TypeGet(); + emitter* emit = GetEmitter(); + emitAttr size = emitTypeSize(treeNode); + GenTree* op1 = treeNode->AsOp()->gtOp1; + GenTree* op2 = treeNode->AsOp()->gtOp2; + + // to get the high bits of the multiply, we are constrained to using the + // 1-op form: RDX:RAX = RAX * rm + // The 3-op form (Rx=Ry*Rz) does not support it. + + genConsumeOperands(treeNode->AsOp()); + + GenTree* regOp = op1; + GenTree* rmOp = op2; + + // Set rmOp to the memory operand (if any) + if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->GetRegNum() == REG_RAX))) + { + regOp = op2; + rmOp = op1; + } + assert(regOp->isUsedFromReg()); + + // Setup targetReg when neither of the source operands was a matching register + if (regOp->GetRegNum() != REG_RAX) + { + inst_RV_RV(ins_Copy(targetType), REG_RAX, regOp->GetRegNum(), targetType); + } + + instruction ins; + if ((treeNode->gtFlags & GTF_UNSIGNED) == 0) + { + ins = INS_imulEAX; + } + else + { + ins = INS_mulEAX; + } + emit->emitInsBinary(ins, size, treeNode, rmOp); + + // Move the result to the desired register, if necessary + if (treeNode->OperGet() == GT_MULHI && targetReg != REG_RDX) + { + inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); + } + + genProduceReg(treeNode); +} + +#ifdef TARGET_X86 +//------------------------------------------------------------------------ +// genCodeForLongUMod: Generate code for a tree of the form +// `(umod (gt_long x y) (const int))` +// +// Arguments: +// node - the node for which to generate code +// +void CodeGen::genCodeForLongUMod(GenTreeOp* node) +{ + assert(node != nullptr); + assert(node->OperGet() == GT_UMOD); + assert(node->TypeGet() == TYP_INT); + + GenTreeOp* const dividend = node->gtOp1->AsOp(); + assert(dividend->OperGet() == GT_LONG); + assert(varTypeIsLong(dividend)); + + genConsumeOperands(node); + + GenTree* const dividendLo = dividend->gtOp1; + GenTree* const dividendHi = dividend->gtOp2; + assert(dividendLo->isUsedFromReg()); + assert(dividendHi->isUsedFromReg()); + + GenTree* const divisor = node->gtOp2; + assert(divisor->gtSkipReloadOrCopy()->OperGet() == GT_CNS_INT); + assert(divisor->gtSkipReloadOrCopy()->isUsedFromReg()); + assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal >= 2); + assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal <= 0x3fffffff); + + // dividendLo must be in RAX; 
dividendHi must be in RDX + genCopyRegIfNeeded(dividendLo, REG_EAX); + genCopyRegIfNeeded(dividendHi, REG_EDX); + + // At this point, EAX:EDX contains the 64bit dividend and op2->GetRegNum() + // contains the 32bit divisor. We want to generate the following code: + // + // cmp edx, divisor->GetRegNum() + // jb noOverflow + // + // mov temp, eax + // mov eax, edx + // xor edx, edx + // div divisor->GetRegNum() + // mov eax, temp + // + // noOverflow: + // div divisor->GetRegNum() + // + // This works because (a * 2^32 + b) % c = ((a % c) * 2^32 + b) % c. + + BasicBlock* const noOverflow = genCreateTempLabel(); + + // cmp edx, divisor->GetRegNum() + // jb noOverflow + inst_RV_RV(INS_cmp, REG_EDX, divisor->GetRegNum()); + inst_JMP(EJ_jb, noOverflow); + + // mov temp, eax + // mov eax, edx + // xor edx, edx + // div divisor->GetRegNum() + // mov eax, temp + const regNumber tempReg = node->GetSingleTempReg(); + inst_RV_RV(INS_mov, tempReg, REG_EAX, TYP_INT); + inst_RV_RV(INS_mov, REG_EAX, REG_EDX, TYP_INT); + instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX); + inst_RV(INS_div, divisor->GetRegNum(), TYP_INT); + inst_RV_RV(INS_mov, REG_EAX, tempReg, TYP_INT); + + // noOverflow: + // div divisor->GetRegNum() + genDefineTempLabel(noOverflow); + inst_RV(INS_div, divisor->GetRegNum(), TYP_INT); + + const regNumber targetReg = node->GetRegNum(); + if (targetReg != REG_EDX) + { + inst_RV_RV(INS_mov, targetReg, REG_RDX, TYP_INT); + } + genProduceReg(node); +} +#endif // TARGET_X86 + +//------------------------------------------------------------------------ +// genCodeForDivMod: Generate code for a DIV or MOD operation. +// +// Arguments: +// treeNode - the node to generate the code for +// +void CodeGen::genCodeForDivMod(GenTreeOp* treeNode) +{ + assert(treeNode->OperIs(GT_DIV, GT_UDIV, GT_MOD, GT_UMOD)); + + GenTree* dividend = treeNode->gtOp1; + +#ifdef TARGET_X86 + if (varTypeIsLong(dividend->TypeGet())) + { + genCodeForLongUMod(treeNode); + return; + } +#endif // TARGET_X86 + + GenTree* divisor = treeNode->gtOp2; + genTreeOps oper = treeNode->OperGet(); + emitAttr size = emitTypeSize(treeNode); + regNumber targetReg = treeNode->GetRegNum(); + var_types targetType = treeNode->TypeGet(); + emitter* emit = GetEmitter(); + + // Node's type must be int/native int, small integer types are not + // supported and floating point types are handled by genCodeForBinary. + assert(varTypeIsIntOrI(targetType)); + // dividend is in a register. + assert(dividend->isUsedFromReg()); + + genConsumeOperands(treeNode->AsOp()); + // dividend must be in RAX + genCopyRegIfNeeded(dividend, REG_RAX); + + // zero or sign extend rax to rdx + if (oper == GT_UMOD || oper == GT_UDIV || + (dividend->IsIntegralConst() && (dividend->AsIntConCommon()->IconValue() > 0))) + { + instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX); + } + else + { + emit->emitIns(INS_cdq, size); + // the cdq instruction writes RDX, So clear the gcInfo for RDX + gcInfo.gcMarkRegSetNpt(RBM_RDX); + } + + // Perform the 'targetType' (64-bit or 32-bit) divide instruction + instruction ins; + if (oper == GT_UMOD || oper == GT_UDIV) + { + ins = INS_div; + } + else + { + ins = INS_idiv; + } + + emit->emitInsBinary(ins, size, treeNode, divisor); + + // DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX. 
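// Illustrative sketch (not part of the patch): the genCodeForLongUMod sequence above relies on the
// identity (a * 2^32 + b) % c == ((a % c) * 2^32 + b) % c, reducing the high half first so the
// 64-by-32 unsigned DIV cannot overflow its 32-bit quotient. A minimal standalone check of that
// identity, assuming an ordinary C++ host compiler; CheckLongUModIdentity is a hypothetical helper,
// not JIT code:
#include <cassert>
#include <cstdint>
static void CheckLongUModIdentity(uint32_t hi, uint32_t lo, uint32_t divisor)
{
    assert(divisor >= 2);                                     // matches the divisor range asserted above
    uint64_t dividend = (uint64_t(hi) << 32) | lo;            // a * 2^32 + b
    uint64_t reduced  = (uint64_t(hi % divisor) << 32) | lo;  // (a % c) * 2^32 + b
    assert(dividend % divisor == reduced % divisor);          // both forms yield the same remainder
}
// End of illustrative sketch.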
+ // Move the result to the desired register, if necessary + if (oper == GT_DIV || oper == GT_UDIV) + { + if (targetReg != REG_RAX) + { + inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType); + } + } + else + { + assert((oper == GT_MOD) || (oper == GT_UMOD)); + if (targetReg != REG_RDX) + { + inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); + } + } + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genCodeForBinary: Generate code for many binary arithmetic operators +// +// Arguments: +// treeNode - The binary operation for which we are generating code. +// +// Return Value: +// None. +// +// Notes: +// Integer MUL and DIV variants have special constraints on x64 so are not handled here. +// See the assert below for the operators that are handled. + +void CodeGen::genCodeForBinary(GenTreeOp* treeNode) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// genCodeForMul: Generate code for a MUL operation. +// +// Arguments: +// treeNode - the node to generate the code for +// +void CodeGen::genCodeForMul(GenTreeOp* treeNode) +{ + assert(false); +} + +#ifdef FEATURE_SIMD + +//------------------------------------------------------------------------ +// genSIMDSplitReturn: Generates code for returning a fixed-size SIMD type that lives +// in a single register, but is returned in multiple registers. +// +// Arguments: +// src - The source of the return +// retTypeDesc - The return type descriptor. +// +void CodeGen::genSIMDSplitReturn(GenTree* src, ReturnTypeDesc* retTypeDesc) +{ + assert(varTypeIsSIMD(src)); + assert(src->isUsedFromReg()); + + // This is a case of operand is in a single reg and needs to be + // returned in multiple ABI return registers. + regNumber opReg = src->GetRegNum(); + regNumber reg0 = retTypeDesc->GetABIReturnReg(0); + regNumber reg1 = retTypeDesc->GetABIReturnReg(1); + + assert((reg0 != REG_NA) && (reg1 != REG_NA) && (opReg != REG_NA)); + + const bool srcIsFloatReg = genIsValidFloatReg(opReg); + const bool dstIsFloatReg = genIsValidFloatReg(reg0); + assert(srcIsFloatReg); + +#ifdef TARGET_AMD64 + assert(src->TypeIs(TYP_SIMD16)); + assert(srcIsFloatReg == dstIsFloatReg); + if (opReg != reg0 && opReg != reg1) + { + // Operand reg is different from return regs. + // Copy opReg to reg0 and let it to be handled by one of the + // two cases below. + inst_RV_RV(ins_Copy(opReg, TYP_SIMD16), reg0, opReg, TYP_SIMD16); + opReg = reg0; + } + + if (opReg == reg0) + { + assert(opReg != reg1); + // reg1 = opReg. + inst_RV_RV(ins_Copy(opReg, TYP_SIMD16), reg1, opReg, TYP_SIMD16); + } + else + { + assert(opReg == reg1); + + // reg0 = opReg. + + inst_RV_RV(ins_Copy(opReg, TYP_SIMD16), reg0, opReg, TYP_SIMD16); + } + // reg0 - already has required 8-byte in bit position [63:0]. + // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0]. + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01); + +#else // TARGET_X86 + assert(src->TypeIs(TYP_SIMD8)); + assert(srcIsFloatReg != dstIsFloatReg); + assert((reg0 == REG_EAX) && (reg1 == REG_EDX)); + // reg0 = opReg[31:0] + inst_RV_RV(ins_Copy(opReg, TYP_INT), reg0, opReg, TYP_INT); + // reg1 = opRef[61:32] + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1); + } + else + { + int8_t shuffleMask = 1; // we only need [61:32]->[31:0], the rest is not read. 
+ inst_RV_TT_IV(INS_pshufd, EA_8BYTE, opReg, src, shuffleMask); + inst_RV_RV(ins_Copy(opReg, TYP_INT), reg1, opReg, TYP_INT); + } +#endif // TARGET_X86 +} + +#endif // FEATURE_SIMD + +#if defined(TARGET_X86) + +//------------------------------------------------------------------------ +// genFloatReturn: Generates code for float return statement for x86. +// +// Note: treeNode's and op1's registers are already consumed. +// +// Arguments: +// treeNode - The GT_RETURN or GT_RETFILT tree node with float type. +// +// Return Value: +// None +// +void CodeGen::genFloatReturn(GenTree* treeNode) +{ + assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT); + assert(varTypeIsFloating(treeNode)); + + GenTree* op1 = treeNode->gtGetOp1(); + // Spill the return value register from an XMM register to the stack, then load it on the x87 stack. + // If it already has a home location, use that. Otherwise, we need a temp. + if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->AsLclVarCommon()->GetLclNum()].lvOnFrame) + { + if (compiler->lvaTable[op1->AsLclVarCommon()->GetLclNum()].GetRegNum() != REG_STK) + { + op1->gtFlags |= GTF_SPILL; + inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->AsLclVarCommon()->GetLclNum())), + emitTypeSize(op1->TypeGet()), op1, op1->GetRegNum()); + } + // Now, load it to the fp stack. + GetEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->GetLclNum(), 0); + } + else + { + // Spill the value, which should be in a register, then load it to the fp stack. + // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet). + op1->gtFlags |= GTF_SPILL; + regSet.rsSpillTree(op1->GetRegNum(), op1); + op1->gtFlags |= GTF_SPILLED; + op1->gtFlags &= ~GTF_SPILL; + + TempDsc* t = regSet.rsUnspillInPlace(op1, op1->GetRegNum()); + inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0); + op1->gtFlags &= ~GTF_SPILLED; + regSet.tmpRlsTemp(t); + } +} +#endif // TARGET_X86 + +//------------------------------------------------------------------------ +// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE/GT_CMP node. +// +// Arguments: +// tree - the node +// +void CodeGen::genCodeForCompare(GenTreeOp* tree) +{ + assert(tree->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT, GT_TEST_EQ, GT_TEST_NE, GT_CMP)); + + // TODO-XArch-CQ: Check if we can use the currently set flags. + // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register + // (signed < or >= where targetReg != REG_NA) + + GenTree* op1 = tree->gtOp1; + var_types op1Type = op1->TypeGet(); + + if (varTypeIsFloating(op1Type)) + { + genCompareFloat(tree); + } + else + { + genCompareInt(tree); + } +} + +//------------------------------------------------------------------------ +// genCodeForBT: Generates code for a GT_BT node. +// +// Arguments: +// tree - The node. +// +//void CodeGen::genCodeForBT(GenTreeOp* bt) +//{ +// assert(false); +//}; +// clang-format on + +// clang-format off +const CodeGen::GenConditionDesc CodeGen::GenConditionDesc::map[32] +{ + { }, // NONE + { }, // 1 + { EJ_jl }, // SLT + { EJ_jle }, // SLE + { EJ_jge }, // SGE + { EJ_jg }, // SGT + { EJ_js }, // S + { EJ_jns }, // NS + + { EJ_je }, // EQ + { EJ_jne }, // NE + { EJ_jb }, // ULT + { EJ_jbe }, // ULE + { EJ_jae }, // UGE + { EJ_ja }, // UGT + { EJ_jb }, // C + { EJ_jae }, // NC + + // Floating point compare instructions (UCOMISS, UCOMISD etc.) 
set the condition flags as follows: + // ZF PF CF Meaning + // --------------------- + // 1 1 1 Unordered + // 0 0 0 Greater + // 0 0 1 Less Than + // 1 0 0 Equal + // + // Since ZF and CF are also set when the result is unordered, in some cases we first need to check + // PF before checking ZF/CF. In general, ordered conditions will result in a jump only if PF is not + // set and unordered conditions will result in a jump only if PF is set. + + { EJ_jnp, GT_AND, EJ_je }, // FEQ + { EJ_jne }, // FNE + { EJ_jnp, GT_AND, EJ_jb }, // FLT + { EJ_jnp, GT_AND, EJ_jbe }, // FLE + { EJ_jae }, // FGE + { EJ_ja }, // FGT + { EJ_jo }, // O + { EJ_jno }, // NO + + { EJ_je }, // FEQU + { EJ_jp, GT_OR, EJ_jne }, // FNEU + { EJ_jb }, // FLTU + { EJ_jbe }, // FLEU + { EJ_jp, GT_OR, EJ_jae }, // FGEU + { EJ_jp, GT_OR, EJ_ja }, // FGTU + { EJ_jp }, // P + { EJ_jnp }, // NP +}; +// clang-format on + + +//------------------------------------------------------------------------ +// inst_SETCC: Generate code to set a register to 0 or 1 based on a condition. +// +// Arguments: +// condition - The condition +// type - The type of the value to be produced +// dstReg - The destination register to be set to 1 or 0 +// +void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstReg) +{ + assert(varTypeIsIntegral(type)); + assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); + + const GenConditionDesc& desc = GenConditionDesc::Get(condition); + + inst_SET(desc.jumpKind1, dstReg); + + if (desc.oper != GT_NONE) + { + BasicBlock* labelNext = genCreateTempLabel(); + inst_JMP((desc.oper == GT_OR) ? desc.jumpKind1 : emitter::emitReverseJumpKind(desc.jumpKind1), labelNext); + inst_SET(desc.jumpKind2, dstReg); + genDefineTempLabel(labelNext); + } + + if (!varTypeIsByte(type)) + { + GetEmitter()->emitIns_R_R(INS_movzx, EA_1BYTE, dstReg, dstReg); + } +} + +//------------------------------------------------------------------------ +// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node. +// +// Arguments: +// tree - the GT_RETURNTRAP node +// +void CodeGen::genCodeForReturnTrap(GenTreeOp* tree) +{ + assert(tree->OperGet() == GT_RETURNTRAP); + + // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC + // based on the contents of 'data' + + GenTree* data = tree->gtOp1; + genConsumeRegs(data); + GenTreeIntCon cns = intForm(TYP_INT, 0); + cns.SetContained(); + GetEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns); + + BasicBlock* skipLabel = genCreateTempLabel(); + + inst_JMP(EJ_je, skipLabel); + + // emit the call to the EE-helper that stops for GC (or other reasons) + regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT); + assert(genIsValidIntReg(tmpReg)); + + genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg); + genDefineTempLabel(skipLabel); +} + +/***************************************************************************** + * + * Generate code for a single node in the tree. 
+ * Preconditions: All operands have been evaluated + * + */ +void CodeGen::genCodeForTreeNode(GenTree* treeNode) +{ + assert(false); +} + +#ifdef FEATURE_SIMD +//---------------------------------------------------------------------------------- +// genMultiRegStoreToSIMDLocal: store multi-reg value to a single-reg SIMD local +// +// Arguments: +// lclNode - GentreeLclVar of GT_STORE_LCL_VAR +// +// Return Value: +// None +// +void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode) +{ +#ifdef UNIX_AMD64_ABI + regNumber dst = lclNode->GetRegNum(); + GenTree* op1 = lclNode->gtGetOp1(); + GenTree* actualOp1 = op1->gtSkipReloadOrCopy(); + unsigned regCount = + actualOp1->IsMultiRegLclVar() ? actualOp1->AsLclVar()->GetFieldCount(compiler) : actualOp1->GetMultiRegCount(); + assert(op1->IsMultiRegNode()); + genConsumeRegs(op1); + + // Right now the only enregistrable structs supported are SIMD types. + // They are only returned in 1 or 2 registers - the 1 register case is + // handled as a regular STORE_LCL_VAR. + // This case is always a call (AsCall() will assert if it is not). + GenTreeCall* call = actualOp1->AsCall(); + const ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); + assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT); + + assert(regCount == 2); + assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0))); + assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1))); + + // This is a case where the two 8-bytes that comprise the operand are in + // two different xmm registers and need to be assembled into a single + // xmm register. + regNumber targetReg = lclNode->GetRegNum(); + regNumber reg0 = call->GetRegNumByIdx(0); + regNumber reg1 = call->GetRegNumByIdx(1); + + if (op1->IsCopyOrReload()) + { + // GT_COPY/GT_RELOAD will have valid reg for those positions + // that need to be copied or reloaded. + regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0); + if (reloadReg != REG_NA) + { + reg0 = reloadReg; + } + + reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1); + if (reloadReg != REG_NA) + { + reg1 = reloadReg; + } + } + + if (targetReg != reg0 && targetReg != reg1) + { + // targetReg = reg0; + // targetReg[127:64] = reg1[127:64] + inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE); + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); + } + else if (targetReg == reg0) + { + // (elided) targetReg = reg0 + // targetReg[127:64] = reg1[127:64] + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); + } + else + { + assert(targetReg == reg1); + // We need two shuffles to achieve this + // First: + // targetReg[63:0] = targetReg[63:0] + // targetReg[127:64] = reg0[63:0] + // + // Second: + // targetReg[63:0] = targetReg[127:64] + // targetReg[127:64] = targetReg[63:0] + // + // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg + // and next swap low and high 8-bytes of targetReg to have them + // rearranged in the right order. + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00); + inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01); + } + genProduceReg(lclNode); +#else // !UNIX_AMD64_ABI + assert(!"Multireg store to SIMD reg not supported on X64 Windows"); +#endif // !UNIX_AMD64_ABI +} +#endif // FEATURE_SIMD + +//------------------------------------------------------------------------ +// genAllocLclFrame: Probe the stack and allocate the local stack frame - subtract from SP. +// +// Arguments: +// frameSize - the size of the stack frame being allocated. 
+// initReg - register to use as a scratch register. +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. +// maskArgRegsLiveIn - incoming argument registers that are currently live. +// +// Return value: +// None +// +void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) +{ + assert(compiler->compGeneratingProlog); + + if (frameSize == 0) + { + return; + } + + const target_size_t pageSize = compiler->eeGetPageSize(); + + if (frameSize == REGSIZE_BYTES) + { + // Frame size is the same as register size. + GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); + compiler->unwindAllocStack(frameSize); + } + else if (frameSize < pageSize) + { + GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); + compiler->unwindAllocStack(frameSize); + + const unsigned lastProbedLocToFinalSp = frameSize; + + if (lastProbedLocToFinalSp + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize) + { + // We haven't probed almost a complete page. If the next action on the stack might subtract from SP + // first, before touching the current SP, then we need to probe at the very bottom. This can + // happen on x86, for example, when we copy an argument to the stack using a "SUB ESP; REP MOV" + // strategy. + GetEmitter()->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_SPBASE, 0); + } + } + else + { +#ifdef TARGET_X86 + int spOffset = -(int)frameSize; + + if (compiler->info.compPublishStubParam) + { + GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_SECRET_STUB_PARAM); + spOffset += REGSIZE_BYTES; + } + + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, spOffset); + regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); + + genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN); + + if (compiler->info.compPublishStubParam) + { + GetEmitter()->emitIns_R(INS_pop, EA_PTRSIZE, REG_SECRET_STUB_PARAM); + GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); + } + else + { + GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); + } +#else // !TARGET_X86 + static_assert_no_msg((RBM_STACK_PROBE_HELPER_ARG & (RBM_SECRET_STUB_PARAM | RBM_DEFAULT_HELPER_CALL_TARGET)) == + RBM_NONE); + + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, -(int)frameSize); + regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); + + genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN); + + if (initReg == REG_DEFAULT_HELPER_CALL_TARGET) + { + *pInitRegZeroed = false; + } + + static_assert_no_msg((RBM_STACK_PROBE_HELPER_TRASH & RBM_STACK_PROBE_HELPER_ARG) == RBM_NONE); + + GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); +#endif // !TARGET_X86 + + compiler->unwindAllocStack(frameSize); + + if (initReg == REG_STACK_PROBE_HELPER_ARG) + { + *pInitRegZeroed = false; + } + } + +#ifdef USING_SCOPE_INFO + if (!doubleAlignOrFramePointerUsed()) + { + psiAdjustStackLevel(frameSize); + } +#endif // USING_SCOPE_INFO +} + +//------------------------------------------------------------------------ +// genStackPointerConstantAdjustment: add a specified constant value to the stack pointer. +// No probe is done. +// +// Arguments: +// spDelta - the value to add to SP. Must be negative or zero. +// regTmp - x86 only: an available temporary register. If not REG_NA, hide the SP +// adjustment from the emitter, using this register. 
+// +// Return Value: +// None. +// +void CodeGen::genStackPointerConstantAdjustment(ssize_t spDelta, regNumber regTmp) +{ + assert(spDelta < 0); + + // We assert that the SP change is less than one page. If it's greater, you should have called a + // function that does a probe, which will in turn call this function. + assert((target_size_t)(-spDelta) <= compiler->eeGetPageSize()); + +#ifdef TARGET_X86 + if (regTmp != REG_NA) + { + // For x86, some cases don't want to use "sub ESP" because we don't want the emitter to track the adjustment + // to ESP. So do the work in the count register. + // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require + // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't + // track". + inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL); + inst_RV_IV(INS_sub, regTmp, (target_ssize_t)-spDelta, EA_PTRSIZE); + inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL); + } + else +#endif // TARGET_X86 + { + inst_RV_IV(INS_sub, REG_SPBASE, (target_ssize_t)-spDelta, EA_PTRSIZE); + } +} + +//------------------------------------------------------------------------ +// genStackPointerConstantAdjustmentWithProbe: add a specified constant value to the stack pointer, +// and probe the stack as appropriate. Should only be called as a helper for +// genStackPointerConstantAdjustmentLoopWithProbe. +// +// Arguments: +// spDelta - the value to add to SP. Must be negative or zero. If zero, the probe happens, +// but the stack pointer doesn't move. +// regTmp - x86 only: an available temporary register. If not REG_NA, hide the SP +// adjustment from the emitter, using this register. +// +// Return Value: +// None. +// +void CodeGen::genStackPointerConstantAdjustmentWithProbe(ssize_t spDelta, regNumber regTmp) +{ + GetEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0); + genStackPointerConstantAdjustment(spDelta, regTmp); +} + +//------------------------------------------------------------------------ +// genStackPointerConstantAdjustmentLoopWithProbe: Add a specified constant value to the stack pointer, +// and probe the stack as appropriate. Generates one probe per page, up to the total amount required. +// This will generate a sequence of probes in-line. It is required for the case where we need to expose +// (not hide) the stack level adjustment. We can't use the dynamic loop in that case, because the total +// stack adjustment would not be visible to the emitter. It would be possible to use this version for +// multiple hidden constant stack level adjustments but we don't do that currently (we use the loop +// version in genStackPointerDynamicAdjustmentWithProbe instead). +// +// Arguments: +// spDelta - the value to add to SP. Must be negative. +// regTmp - x86 only: an available temporary register. If not REG_NA, hide the SP +// adjustment from the emitter, using this register. +// +// Return Value: +// Offset in bytes from SP to last probed address. 
+// +target_ssize_t CodeGen::genStackPointerConstantAdjustmentLoopWithProbe(ssize_t spDelta, regNumber regTmp) +{ + assert(spDelta < 0); + + const target_size_t pageSize = compiler->eeGetPageSize(); + + ssize_t spRemainingDelta = spDelta; + do + { + ssize_t spOneDelta = -(ssize_t)min((target_size_t)-spRemainingDelta, pageSize); + genStackPointerConstantAdjustmentWithProbe(spOneDelta, regTmp); + spRemainingDelta -= spOneDelta; + } while (spRemainingDelta < 0); + + // What offset from the final SP was the last probe? This depends on the fact that + // genStackPointerConstantAdjustmentWithProbe() probes first, then does "SUB SP". + target_size_t lastTouchDelta = (target_size_t)(-spDelta) % pageSize; + if ((lastTouchDelta == 0) || (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize)) + { + // We haven't probed almost a complete page. If lastTouchDelta==0, then spDelta was an exact + // multiple of pageSize, which means we last probed exactly one page back. Otherwise, we probed + // the page, but very far from the end. If the next action on the stack might subtract from SP + // first, before touching the current SP, then we do one more probe at the very bottom. This can + // happen on x86, for example, when we copy an argument to the stack using a "SUB ESP; REP MOV" + // strategy. + + GetEmitter()->emitIns_AR_R(INS_test, EA_PTRSIZE, REG_EAX, REG_SPBASE, 0); + lastTouchDelta = 0; + } + + return lastTouchDelta; +} + +//------------------------------------------------------------------------ +// genStackPointerDynamicAdjustmentWithProbe: add a register value to the stack pointer, +// and probe the stack as appropriate. +// +// Note that for x86, we hide the ESP adjustment from the emitter. To do that, currently, +// requires a temporary register and extra code. +// +// Arguments: +// regSpDelta - the register value to add to SP. The value in this register must be negative. +// This register might be trashed. +// regTmp - an available temporary register. Will be trashed. +// +// Return Value: +// None. +// +//void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta, regNumber regTmp) +//{ +// assert(false); +//} + +//------------------------------------------------------------------------ +// genLclHeap: Generate code for localloc. +// +// Arguments: +// tree - the localloc tree to generate. +// +// Notes: +// Note that for x86, we don't track ESP movements while generating the localloc code. +// The ESP tracking is used to report stack pointer-relative GC info, which is not +// interesting while doing the localloc construction. Also, for functions with localloc, +// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function +// call arguments. +// +// For x86, we store the ESP after the localloc is complete in the LocAllocSP +// variable. This variable is implicitly reported to the VM in the GC info (its position +// is defined by convention relative to other items), and is used by the GC to find the +// "base" stack pointer in functions with localloc. +// +void CodeGen::genLclHeap(GenTree* tree) +{ + assert(false); +} + +// +//------------------------------------------------------------------------ +// genCodeForInitBlkRepStos: Generate code for InitBlk using rep stos. +// +// Arguments: +// initBlkNode - The Block store for which we are generating code. 
+//
+void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode)
+{
+    genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX);
+    instGen(INS_r_stosb);
+}
+
+//----------------------------------------------------------------------------------
+// genCodeForInitBlkUnroll: Generate unrolled block initialization code.
+//
+// Arguments:
+//    node - the GT_STORE_BLK node to generate code for
+//
+void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
+{
+    assert(node->OperIs(GT_STORE_BLK));
+
+    unsigned dstLclNum = BAD_VAR_NUM;
+    regNumber dstAddrBaseReg = REG_NA;
+    regNumber dstAddrIndexReg = REG_NA;
+    unsigned dstAddrIndexScale = 1;
+    int dstOffset = 0;
+    GenTree* dstAddr = node->Addr();
+
+    if (!dstAddr->isContained())
+    {
+        dstAddrBaseReg = genConsumeReg(dstAddr);
+    }
+    else if (dstAddr->OperIsAddrMode())
+    {
+        GenTreeAddrMode* addrMode = dstAddr->AsAddrMode();
+
+        if (addrMode->HasBase())
+        {
+            dstAddrBaseReg = genConsumeReg(addrMode->Base());
+        }
+
+        if (addrMode->HasIndex())
+        {
+            dstAddrIndexReg = genConsumeReg(addrMode->Index());
+            dstAddrIndexScale = addrMode->GetScale();
+        }
+
+        dstOffset = addrMode->Offset();
+    }
+    else
+    {
+        assert(dstAddr->OperIsLocalAddr());
+        dstLclNum = dstAddr->AsLclVarCommon()->GetLclNum();
+        dstOffset = dstAddr->AsLclVarCommon()->GetLclOffs();
+    }
+
+    regNumber srcIntReg = REG_NA;
+    GenTree* src = node->Data();
+
+    if (src->OperIs(GT_INIT_VAL))
+    {
+        assert(src->isContained());
+        src = src->AsUnOp()->gtGetOp1();
+    }
+
+    if (!src->isContained())
+    {
+        srcIntReg = genConsumeReg(src);
+    }
+    else
+    {
+        // If src is contained then it must be 0 and the size must be a multiple
+        // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions.
+        assert(src->IsIntegralConst(0));
+        assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0);
+    }
+
+    emitter* emit = GetEmitter();
+    unsigned size = node->GetLayout()->GetSize();
+
+    assert(size <= INT32_MAX);
+    assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
+
+    // Fill as much as possible using SSE2 stores.
+    if (size >= XMM_REGSIZE_BYTES)
+    {
+        regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
+
+        if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
+        {
+            // If the source is constant 0 then always use xorps, it's faster
+            // than copying the constant from a GPR to a XMM register.
+            emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
+        }
+        else
+        {
+            emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, srcXmmReg, srcIntReg);
+            emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
+#ifdef TARGET_X86
+            // For x86, we need one more to convert it from 8 bytes to 16 bytes.
+            emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
+#endif
+        }
+
+        instruction simdMov = simdUnalignedMovIns();
+        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
+        {
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
+        // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need
+        // to allocate a GPR just for the remainder.
+    }
+
+    // Fill the remainder using normal stores.
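+    // As an illustration only (assuming an x64 target and the fill value in RAX): with 13 bytes
+    // remaining, the loop below halves regSize as needed and emits stores of 8, 4 and 1 bytes,
+    // roughly:
+    //     mov qword ptr [dst+0],  rax
+    //     mov dword ptr [dst+8],  eax
+    //     mov byte  ptr [dst+12], al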
+ for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) + { + while (regSize > size) + { + regSize /= 2; + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } +} + +#ifdef TARGET_AMD64 +//------------------------------------------------------------------------ +// genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call +// +// Arguments: +// initBlkNode - the GT_STORE_[BLK|OBJ|DYN_BLK] +// +// Preconditions: +// The register assignments have been set appropriately. +// This is validated by genConsumeBlockOp(). +// +void CodeGen::genCodeForInitBlkHelper(GenTreeBlk* initBlkNode) +{ + // Destination address goes in arg0, source address goes in arg1, and size goes in arg2. + // genConsumeBlockOp takes care of this for us. + genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); + + genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN); +} +#endif // TARGET_AMD64 + +#ifdef FEATURE_PUT_STRUCT_ARG_STK +// Generate code for a load from some address + offset +// baseNode: tree node which can be either a local address or arbitrary node +// offset: distance from the baseNode from which to load +void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset) +{ + emitter* emit = GetEmitter(); + + if (baseNode->OperIsLocalAddr()) + { + const GenTreeLclVarCommon* lclVar = baseNode->AsLclVarCommon(); + offset += lclVar->GetLclOffs(); + emit->emitIns_R_S(ins, size, dst, lclVar->GetLclNum(), offset); + } + else + { + emit->emitIns_R_AR(ins, size, dst, baseNode->GetRegNum(), offset); + } +} +#endif // FEATURE_PUT_STRUCT_ARG_STK + +//---------------------------------------------------------------------------------- +// genCodeForCpBlkUnroll - Generate unrolled block copy code. 
+//
+// Arguments:
+//    node - the GT_STORE_BLK node to generate code for
+//
+void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
+{
+    assert(node->OperIs(GT_STORE_BLK));
+
+    unsigned dstLclNum = BAD_VAR_NUM;
+    regNumber dstAddrBaseReg = REG_NA;
+    regNumber dstAddrIndexReg = REG_NA;
+    unsigned dstAddrIndexScale = 1;
+    int dstOffset = 0;
+    GenTree* dstAddr = node->Addr();
+
+    if (!dstAddr->isContained())
+    {
+        dstAddrBaseReg = genConsumeReg(dstAddr);
+    }
+    else if (dstAddr->OperIsAddrMode())
+    {
+        GenTreeAddrMode* addrMode = dstAddr->AsAddrMode();
+
+        if (addrMode->HasBase())
+        {
+            dstAddrBaseReg = genConsumeReg(addrMode->Base());
+        }
+
+        if (addrMode->HasIndex())
+        {
+            dstAddrIndexReg = genConsumeReg(addrMode->Index());
+            dstAddrIndexScale = addrMode->GetScale();
+        }
+
+        dstOffset = addrMode->Offset();
+    }
+    else
+    {
+        assert(dstAddr->OperIsLocalAddr());
+        const GenTreeLclVarCommon* lclVar = dstAddr->AsLclVarCommon();
+        dstLclNum = lclVar->GetLclNum();
+        dstOffset = lclVar->GetLclOffs();
+    }
+
+    unsigned srcLclNum = BAD_VAR_NUM;
+    regNumber srcAddrBaseReg = REG_NA;
+    regNumber srcAddrIndexReg = REG_NA;
+    unsigned srcAddrIndexScale = 1;
+    int srcOffset = 0;
+    GenTree* src = node->Data();
+
+    assert(src->isContained());
+
+    if (src->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+    {
+        srcLclNum = src->AsLclVarCommon()->GetLclNum();
+        srcOffset = src->AsLclVarCommon()->GetLclOffs();
+    }
+    else
+    {
+        assert(src->OperIs(GT_IND));
+        GenTree* srcAddr = src->AsIndir()->Addr();
+
+        if (!srcAddr->isContained())
+        {
+            srcAddrBaseReg = genConsumeReg(srcAddr);
+        }
+        else if (srcAddr->OperIsAddrMode())
+        {
+            GenTreeAddrMode* addrMode = srcAddr->AsAddrMode();
+
+            if (addrMode->HasBase())
+            {
+                srcAddrBaseReg = genConsumeReg(addrMode->Base());
+            }
+
+            if (addrMode->HasIndex())
+            {
+                srcAddrIndexReg = genConsumeReg(addrMode->Index());
+                srcAddrIndexScale = addrMode->GetScale();
+            }
+
+            srcOffset = addrMode->Offset();
+        }
+        else
+        {
+            assert(srcAddr->OperIsLocalAddr());
+            srcLclNum = srcAddr->AsLclVarCommon()->GetLclNum();
+            srcOffset = srcAddr->AsLclVarCommon()->GetLclOffs();
+        }
+    }
+
+    emitter* emit = GetEmitter();
+    unsigned size = node->GetLayout()->GetSize();
+
+    assert(size <= INT32_MAX);
+    assert(srcOffset < (INT32_MAX - static_cast<int>(size)));
+    assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
+
+    if (size >= XMM_REGSIZE_BYTES)
+    {
+        regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);
+
+        instruction simdMov = simdUnalignedMovIns();
+        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
+             size -= regSize, srcOffset += regSize, dstOffset += regSize)
+        {
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
+        // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
+        // allocate a GPR just for the remainder.
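+        // As an illustration only (assuming an x64 target, tempReg == XMM0 and movdqu for simdMov):
+        // a 40-byte copy runs this loop twice and leaves 8 bytes for the integer loop below, roughly:
+        //     movdqu xmm0, [src+0]    /  movdqu [dst+0],  xmm0
+        //     movdqu xmm0, [src+16]   /  movdqu [dst+16], xmm0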
+ } + + if (size > 0) + { + regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); + + for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) + { + while (regSize > size) + { + regSize /= 2; + } + + if (srcLclNum != BAD_VAR_NUM) + { + emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + } + else + { + emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + srcAddrIndexScale, srcOffset); + } + + if (dstLclNum != BAD_VAR_NUM) + { + emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + } + else + { + emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + dstAddrIndexScale, dstOffset); + } + } + } +} + +//---------------------------------------------------------------------------------- +// genCodeForCpBlkRepMovs - Generate code for CpBlk by using rep movs +// +// Arguments: +// cpBlkNode - the GT_STORE_[BLK|OBJ|DYN_BLK] +// +// Preconditions: +// The register assignments have been set appropriately. +// This is validated by genConsumeBlockOp(). +// +void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode) +{ + // Destination address goes in RDI, source address goes in RSE, and size goes in RCX. + // genConsumeBlockOp takes care of this for us. + genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX); + instGen(INS_r_movsb); +} + +#ifdef FEATURE_PUT_STRUCT_ARG_STK +//------------------------------------------------------------------------ +// CodeGen::genMove8IfNeeded: Conditionally move 8 bytes of a struct to the argument area +// +// Arguments: +// size - The size of bytes remaining to be moved +// longTmpReg - The tmp register to be used for the long value +// srcAddr - The address of the source struct +// offset - The current offset being copied +// +// Return Value: +// Returns the number of bytes moved (8 or 0). +// +// Notes: +// This is used in the PutArgStkKindUnroll case, to move any bytes that are +// not an even multiple of 16. +// On x86, longTmpReg must be an xmm reg; on x64 it must be an integer register. +// This is checked by genStoreRegToStackArg. +// +unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree* srcAddr, unsigned offset) +{ +#ifdef TARGET_X86 + instruction longMovIns = INS_movq; +#else // !TARGET_X86 + instruction longMovIns = INS_mov; +#endif // !TARGET_X86 + if ((size & 8) != 0) + { + genCodeForLoadOffset(longMovIns, EA_8BYTE, longTmpReg, srcAddr, offset); + genStoreRegToStackArg(TYP_LONG, longTmpReg, offset); + return 8; + } + return 0; +} + +//------------------------------------------------------------------------ +// CodeGen::genMove4IfNeeded: Conditionally move 4 bytes of a struct to the argument area +// +// Arguments: +// size - The size of bytes remaining to be moved +// intTmpReg - The tmp register to be used for the long value +// srcAddr - The address of the source struct +// offset - The current offset being copied +// +// Return Value: +// Returns the number of bytes moved (4 or 0). +// +// Notes: +// This is used in the PutArgStkKindUnroll case, to move any bytes that are +// not an even multiple of 16. +// intTmpReg must be an integer register. +// This is checked by genStoreRegToStackArg. 
+// +unsigned CodeGen::genMove4IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset) +{ + if ((size & 4) != 0) + { + genCodeForLoadOffset(INS_mov, EA_4BYTE, intTmpReg, srcAddr, offset); + genStoreRegToStackArg(TYP_INT, intTmpReg, offset); + return 4; + } + return 0; +} + +//------------------------------------------------------------------------ +// CodeGen::genMove2IfNeeded: Conditionally move 2 bytes of a struct to the argument area +// +// Arguments: +// size - The size of bytes remaining to be moved +// intTmpReg - The tmp register to be used for the long value +// srcAddr - The address of the source struct +// offset - The current offset being copied +// +// Return Value: +// Returns the number of bytes moved (2 or 0). +// +// Notes: +// This is used in the PutArgStkKindUnroll case, to move any bytes that are +// not an even multiple of 16. +// intTmpReg must be an integer register. +// This is checked by genStoreRegToStackArg. +// +unsigned CodeGen::genMove2IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset) +{ + if ((size & 2) != 0) + { + genCodeForLoadOffset(INS_mov, EA_2BYTE, intTmpReg, srcAddr, offset); + genStoreRegToStackArg(TYP_SHORT, intTmpReg, offset); + return 2; + } + return 0; +} + +//------------------------------------------------------------------------ +// CodeGen::genMove1IfNeeded: Conditionally move 1 byte of a struct to the argument area +// +// Arguments: +// size - The size of bytes remaining to be moved +// intTmpReg - The tmp register to be used for the long value +// srcAddr - The address of the source struct +// offset - The current offset being copied +// +// Return Value: +// Returns the number of bytes moved (1 or 0). +// +// Notes: +// This is used in the PutArgStkKindUnroll case, to move any bytes that are +// not an even multiple of 16. +// intTmpReg must be an integer register. +// This is checked by genStoreRegToStackArg. +// +unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset) +{ + if ((size & 1) != 0) + { + genCodeForLoadOffset(INS_mov, EA_1BYTE, intTmpReg, srcAddr, offset); + genStoreRegToStackArg(TYP_BYTE, intTmpReg, offset); + return 1; + } + return 0; +} + +//---------------------------------------------------------------------------------------------------------------// +// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling. +// +// Arguments: +// putArgNode - the PutArgStk tree. +// +// Notes: +// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct will be copied to the +// stack. +// +// TODO-Amd64-Unix: Try to share code with copyblk. +// Need refactoring of copyblk before it could be used for putarg_stk. +// The difference for now is that a putarg_stk contains its children, while cpyblk does not. +// This creates differences in code. After some significant refactoring it could be reused. +// +void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode) +{ + GenTree* src = putArgNode->AsOp()->gtOp1; + // We will never call this method for SIMD types, which are stored directly + // in genPutStructArgStk(). 
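+    // As an illustration only: on x64 a 24-byte struct is copied to the outgoing argument area below
+    // as one 16-byte SSE chunk followed by an 8-byte integer move (see genMove8IfNeeded).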
+ noway_assert(src->TypeGet() == TYP_STRUCT); + + unsigned size = putArgNode->GetStackByteSize(); + assert(size <= CPBLK_UNROLL_LIMIT); + + emitter* emit = GetEmitter(); + unsigned putArgOffset = putArgNode->getArgOffset(); + + assert(src->isContained()); + + assert(src->gtOper == GT_OBJ); + + if (src->AsOp()->gtOp1->isUsedFromReg()) + { + genConsumeReg(src->AsOp()->gtOp1); + } + + unsigned offset = 0; + + regNumber xmmTmpReg = REG_NA; + regNumber intTmpReg = REG_NA; + regNumber longTmpReg = REG_NA; +#ifdef TARGET_X86 + // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's + // less than 16 bytes, we will just be using pushes + if (size >= 8) + { + xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT); + longTmpReg = xmmTmpReg; + } + if ((size & 0x7) != 0) + { + intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT); + } +#else // !TARGET_X86 + // On x64 we use an XMM register only for 16-byte chunks. + if (size >= XMM_REGSIZE_BYTES) + { + xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT); + } + if ((size & 0xf) != 0) + { + intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT); + longTmpReg = intTmpReg; + } +#endif // !TARGET_X86 + + // If the size of this struct is larger than 16 bytes + // let's use SSE2 to be able to do 16 byte at a time + // loads and stores. + if (size >= XMM_REGSIZE_BYTES) + { +#ifdef TARGET_X86 + assert(!m_pushStkArg); +#endif // TARGET_X86 + size_t slots = size / XMM_REGSIZE_BYTES; + + assert(putArgNode->gtGetOp1()->isContained()); + assert(putArgNode->gtGetOp1()->AsOp()->gtOper == GT_OBJ); + + // TODO: In the below code the load and store instructions are for 16 bytes, but the + // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but + // this probably needs to be changed. + while (slots-- > 0) + { + // Load + genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset); + + // Store + genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset); + + offset += XMM_REGSIZE_BYTES; + } + } + + // Fill the remainder (15 bytes or less) if there's one. + if ((size & 0xf) != 0) + { +#ifdef TARGET_X86 + if (m_pushStkArg) + { + // This case is currently supported only for the case where the total size is + // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse + // order. However, morph has ensured that we have a struct that is an even + // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment. + assert(((size & 0xc) == size) && (offset == 0)); + // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on + // whether we've got an 8 byte chunk, and then push it on the stack. + unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, size & 0x8); + // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk) + // and push it on the stack. + pushedBytes += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, 0); + } + else +#endif // TARGET_X86 + { + offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset); + offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset); + offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset); + offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset); + assert(offset == size); + } + } +} + +//------------------------------------------------------------------------ +// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs. +// +// Arguments: +// putArgNode - the PutArgStk tree. 
+// +// Preconditions: +// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct bits will go. +// +void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode) +{ + GenTree* srcAddr = putArgNode->gtGetOp1(); + assert(srcAddr->TypeGet() == TYP_STRUCT); + + // Make sure we got the arguments of the cpblk operation in the right registers, and that + // 'srcAddr' is contained as expected. + assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI)); + assert(srcAddr->isContained()); + + genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX); + instGen(INS_r_movsb); +} + +//------------------------------------------------------------------------ +// If any Vector3 args are on stack and they are not pass-by-ref, the upper 32bits +// must be cleared to zeroes. The native compiler doesn't clear the upper bits +// and there is no way to know if the caller is native or not. So, the upper +// 32 bits of Vector argument on stack are always cleared to zero. +#if defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD) +void CodeGen::genClearStackVec3ArgUpperBits() +{ +#ifdef DEBUG + if (verbose) + { + printf("*************** In genClearStackVec3ArgUpperBits()\n"); + } +#endif + + assert(compiler->compGeneratingProlog); + + unsigned varNum = 0; + + for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++) + { + LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); + assert(varDsc->lvIsParam); + + // Does var has simd12 type? + if (varDsc->lvType != TYP_SIMD12) + { + continue; + } + + if (!varDsc->lvIsRegArg) + { + // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0 + GetEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0); + } + else + { + // Assume that for x64 linux, an argument is fully in registers + // or fully on stack. + regNumber argReg = varDsc->GetOtherArgReg(); + + // Clear the upper 32 bits by two shift instructions. + // argReg = argReg << 96 + GetEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + // argReg = argReg >> 96 + GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); + } + } +} +#endif // defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD) +#endif // FEATURE_PUT_STRUCT_ARG_STK + +// +// genCodeForCpObj - Generate code for CpObj nodes to copy structs that have interleaved +// GC pointers. +// +// Arguments: +// cpObjNode - the GT_STORE_OBJ +// +// Notes: +// This will generate a sequence of movsp instructions for the cases of non-gc members. +// Note that movsp is an alias for movsd on x86 and movsq on x64. +// and calls to the BY_REF_ASSIGN helper otherwise. +// +// Preconditions: +// The register assignments have been set appropriately. +// This is validated by genConsumeBlockOp(). +// +void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode) +{ + assert(false); +} + +#ifdef TARGET_AMD64 +//---------------------------------------------------------------------------------- +// genCodeForCpBlkHelper - Generate code for a CpBlk node by the means of the VM memcpy helper call +// +// Arguments: +// cpBlkNode - the GT_STORE_[BLK|OBJ|DYN_BLK] +// +// Preconditions: +// The register assignments have been set appropriately. +// This is validated by genConsumeBlockOp(). +// +void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode) +{ + // Destination address goes in arg0, source address goes in arg1, and size goes in arg2. + // genConsumeBlockOp takes care of this for us. 
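+    // As an illustration only (Windows x64 ABI assumed, where REG_ARG_0/1/2 are RCX/RDX/R8), this
+    // reduces to roughly:
+    //     mov rcx, dst ; mov rdx, src ; mov r8, size ; call CORINFO_HELP_MEMCPY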
+ genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); + + genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN); +} +#endif // TARGET_AMD64 + +// generate code do a switch statement based on a table of ip-relative offsets +void CodeGen::genTableBasedSwitch(GenTree* treeNode) +{ + genConsumeOperands(treeNode->AsOp()); + regNumber idxReg = treeNode->AsOp()->gtOp1->GetRegNum(); + regNumber baseReg = treeNode->AsOp()->gtOp2->GetRegNum(); + + regNumber tmpReg = treeNode->GetSingleTempReg(); + + // load the ip-relative offset (which is relative to start of fgFirstBB) + GetEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0); + + // add it to the absolute address of fgFirstBB + compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET; + GetEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg); + GetEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg); + // jmp baseReg + GetEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg); +} + +// emits the table and an instruction to get the address of the first element +void CodeGen::genJumpTable(GenTree* treeNode) +{ + noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH); + assert(treeNode->OperGet() == GT_JMPTABLE); + + unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount; + BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab; + unsigned jmpTabOffs; + unsigned jmpTabBase; + + jmpTabBase = GetEmitter()->emitBBTableDataGenBeg(jumpCount, true); + + jmpTabOffs = 0; + + JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", compiler->compMethodID, jmpTabBase); + + for (unsigned i = 0; i < jumpCount; i++) + { + BasicBlock* target = *jumpTable++; + noway_assert(target->bbFlags & BBF_JMP_TARGET); + + JITDUMP(" DD L_M%03u_" FMT_BB "\n", compiler->compMethodID, target->bbNum); + + GetEmitter()->emitDataGenData(i, target); + }; + + GetEmitter()->emitDataGenEnd(); + + // Access to inline data is 'abstracted' by a special type of static member + // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference + // to constant data, not a real static field. + GetEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->GetRegNum(), + compiler->eeFindJitDataOffs(jmpTabBase), 0); + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genCodeForLockAdd: Generate code for a GT_LOCKADD node +// +// Arguments: +// node - the GT_LOCKADD node +// +//void CodeGen::genCodeForLockAdd(GenTreeOp* node) +//{ +// assert(false); +//} + +//------------------------------------------------------------------------ +// genLockedInstructions: Generate code for a GT_XADD or GT_XCHG node. +// +// Arguments: +// node - the GT_XADD/XCHG node +// +void CodeGen::genLockedInstructions(GenTreeOp* node) +{ + assert(node->OperIs(GT_XADD, GT_XCHG)); + + GenTree* addr = node->gtGetOp1(); + GenTree* data = node->gtGetOp2(); + emitAttr size = emitTypeSize(node->TypeGet()); + + assert(addr->isUsedFromReg()); + assert(data->isUsedFromReg()); + assert((size == EA_4BYTE) || (size == EA_PTRSIZE)); + + genConsumeOperands(node); + + if (node->GetRegNum() != data->GetRegNum()) + { + // If the destination register is different from the data register then we need + // to first move the data to the target register. Make sure we don't overwrite + // the address, the register allocator should have taken care of this. 
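+        // As an illustration only (assuming GT_XADD with the address in RCX and the result in RAX),
+        // the code below ends up emitting roughly:
+        //     mov  rax, <data>
+        //     lock xadd [rcx], rax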
+ assert(node->GetRegNum() != addr->GetRegNum()); + GetEmitter()->emitIns_R_R(INS_mov, size, node->GetRegNum(), data->GetRegNum()); + } + + instruction ins = node->OperIs(GT_XADD) ? INS_xadd : INS_xchg; + + // XCHG has an implied lock prefix when the first operand is a memory operand. + if (ins != INS_xchg) + { + instGen(INS_lock); + } + + GetEmitter()->emitIns_AR_R(ins, size, node->GetRegNum(), addr->GetRegNum(), 0); + genProduceReg(node); +} + +//------------------------------------------------------------------------ +// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node. +// +// Arguments: +// tree - the GT_CMPXCHG node +// +void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree) +{ + assert(tree->OperIs(GT_CMPXCHG)); + + var_types targetType = tree->TypeGet(); + regNumber targetReg = tree->GetRegNum(); + + GenTree* location = tree->gtOpLocation; // arg1 + GenTree* value = tree->gtOpValue; // arg2 + GenTree* comparand = tree->gtOpComparand; // arg3 + + assert(location->GetRegNum() != REG_NA && location->GetRegNum() != REG_RAX); + assert(value->GetRegNum() != REG_NA && value->GetRegNum() != REG_RAX); + + genConsumeReg(location); + genConsumeReg(value); + genConsumeReg(comparand); + + // comparand goes to RAX; + // Note that we must issue this move after the genConsumeRegs(), in case any of the above + // have a GT_COPY from RAX. + if (comparand->GetRegNum() != REG_RAX) + { + inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->GetRegNum(), comparand->TypeGet()); + } + + // location is Rm + instGen(INS_lock); + + GetEmitter()->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->GetRegNum(), location->GetRegNum(), 0); + + // Result is in RAX + if (targetReg != REG_RAX) + { + inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType); + } + + genProduceReg(tree); +} + +// generate code for BoundsCheck nodes +void CodeGen::genRangeCheck(GenTree* oper) +{ + noway_assert(oper->OperIsBoundsCheck()); + GenTreeBoundsChk* bndsChk = oper->AsBoundsChk(); + + GenTree* arrIndex = bndsChk->gtIndex; + GenTree* arrLen = bndsChk->gtArrLen; + + GenTree * src1, *src2; + emitJumpKind jmpKind; + instruction cmpKind; + + genConsumeRegs(arrIndex); + genConsumeRegs(arrLen); + + if (arrIndex->IsIntegralConst(0) && arrLen->isUsedFromReg()) + { + // arrIndex is 0 and arrLen is in a reg. In this case + // we can generate + // test reg, reg + // since arrLen is non-negative + src1 = arrLen; + src2 = arrLen; + jmpKind = EJ_je; + cmpKind = INS_test; + } + else if (arrIndex->isContainedIntOrIImmed()) + { + // arrIndex is a contained constant. In this case + // we will generate one of the following + // cmp [mem], immed (if arrLen is a memory op) + // cmp reg, immed (if arrLen is in a reg) + // + // That is arrLen cannot be a contained immed. + assert(!arrLen->isContainedIntOrIImmed()); + + src1 = arrLen; + src2 = arrIndex; + jmpKind = EJ_jbe; + cmpKind = INS_cmp; + } + else + { + // arrIndex could either be a contained memory op or a reg + // In this case we will generate one of the following + // cmp [mem], immed (if arrLen is a constant) + // cmp [mem], reg (if arrLen is in a reg) + // cmp reg, immed (if arrIndex is in a reg) + // cmp reg1, reg2 (if arrIndex is in reg1) + // cmp reg, [mem] (if arrLen is a memory op) + // + // That is only one of arrIndex or arrLen can be a memory op. 
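+        // As an illustration only, for the common reg/reg case this path produces roughly:
+        //     cmp <indexReg>, <lengthReg>
+        //     jae <range-check failure block>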
+ assert(!arrIndex->isUsedFromMemory() || !arrLen->isUsedFromMemory()); + + src1 = arrIndex; + src2 = arrLen; + jmpKind = EJ_jae; + cmpKind = INS_cmp; + } + + var_types bndsChkType = src2->TypeGet(); +#if DEBUG + // Bounds checks can only be 32 or 64 bit sized comparisons. + assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG); + + // The type of the bounds check should always wide enough to compare against the index. + assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet())); +#endif // DEBUG + + GetEmitter()->emitInsBinary(cmpKind, emitTypeSize(bndsChkType), src1, src2); + genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB); +} + +//--------------------------------------------------------------------- +// genCodeForPhysReg - generate code for a GT_PHYSREG node +// +// Arguments +// tree - the GT_PHYSREG node +// +// Return value: +// None +// +void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree) +{ + assert(tree->OperIs(GT_PHYSREG)); + + var_types targetType = tree->TypeGet(); + regNumber targetReg = tree->GetRegNum(); + + if (targetReg != tree->gtSrcReg) + { + inst_RV_RV(ins_Copy(targetType), targetReg, tree->gtSrcReg, targetType); + genTransferRegGCState(targetReg, tree->gtSrcReg); + } + + genProduceReg(tree); +} + +//--------------------------------------------------------------------- +// genCodeForNullCheck - generate code for a GT_NULLCHECK node +// +// Arguments +// tree - the GT_NULLCHECK node +// +// Return value: +// None +// +void CodeGen::genCodeForNullCheck(GenTreeIndir* tree) +{ + assert(tree->OperIs(GT_NULLCHECK)); + + assert(tree->gtOp1->isUsedFromReg()); + regNumber reg = genConsumeReg(tree->gtOp1); + GetEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0); +} + +//------------------------------------------------------------------------ +// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the +// lower bound for the given dimension. +// +// Arguments: +// elemType - the element type of the array +// rank - the rank of the array +// dimension - the dimension for which the lower bound offset will be returned. +// +// Return Value: +// The offset. + +unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension) +{ + // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. + return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank); +} + +//------------------------------------------------------------------------ +// genOffsetOfMDArrayLength: Returns the offset from the Array object to the +// size for the given dimension. +// +// Arguments: +// elemType - the element type of the array +// rank - the rank of the array +// dimension - the dimension for which the lower bound offset will be returned. +// +// Return Value: +// The offset. + +unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension) +{ + // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. + return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension; +} + +//------------------------------------------------------------------------ +// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference, +// producing the effective index by subtracting the lower bound. 
+// +// Arguments: +// arrIndex - the node for which we're generating code +// +// Return Value: +// None. +// + +void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex) +{ + GenTree* arrObj = arrIndex->ArrObj(); + GenTree* indexNode = arrIndex->IndexExpr(); + + regNumber arrReg = genConsumeReg(arrObj); + regNumber indexReg = genConsumeReg(indexNode); + regNumber tgtReg = arrIndex->GetRegNum(); + + unsigned dim = arrIndex->gtCurrDim; + unsigned rank = arrIndex->gtArrRank; + var_types elemType = arrIndex->gtArrElemType; + + noway_assert(tgtReg != REG_NA); + + // Subtract the lower bound for this dimension. + // TODO-XArch-CQ: make this contained if it's an immediate that fits. + if (tgtReg != indexReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet()); + } + GetEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg, + genOffsetOfMDArrayLowerBound(elemType, rank, dim)); + GetEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg, + genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); + genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL); + + genProduceReg(arrIndex); +} + +//------------------------------------------------------------------------ +// genCodeForArrOffset: Generates code to compute the flattened array offset for +// one dimension of an array reference: +// result = (prevDimOffset * dimSize) + effectiveIndex +// where dimSize is obtained from the arrObj operand +// +// Arguments: +// arrOffset - the node for which we're generating code +// +// Return Value: +// None. +// +// Notes: +// dimSize and effectiveIndex are always non-negative, the former by design, +// and the latter because it has been normalized to be zero-based. + +void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset) +{ + GenTree* offsetNode = arrOffset->gtOffset; + GenTree* indexNode = arrOffset->gtIndex; + GenTree* arrObj = arrOffset->gtArrObj; + + regNumber tgtReg = arrOffset->GetRegNum(); + assert(tgtReg != REG_NA); + + unsigned dim = arrOffset->gtCurrDim; + unsigned rank = arrOffset->gtArrRank; + var_types elemType = arrOffset->gtArrElemType; + + // First, consume the operands in the correct order. + regNumber offsetReg = REG_NA; + regNumber tmpReg = REG_NA; + if (!offsetNode->IsIntegralConst(0)) + { + offsetReg = genConsumeReg(offsetNode); + + // We will use a temp register for the offset*scale+effectiveIndex computation. + tmpReg = arrOffset->GetSingleTempReg(); + } + else + { + assert(offsetNode->isContained()); + } + regNumber indexReg = genConsumeReg(indexNode); + // Although arrReg may not be used in the constant-index case, if we have generated + // the value into a register, we must consume it, otherwise we will fail to end the + // live range of the gc ptr. + // TODO-CQ: Currently arrObj will always have a register allocated to it. + // We could avoid allocating a register for it, which would be of value if the arrObj + // is an on-stack lclVar. + regNumber arrReg = REG_NA; + if (arrObj->gtHasReg()) + { + arrReg = genConsumeReg(arrObj); + } + + if (!offsetNode->IsIntegralConst(0)) + { + assert(tmpReg != REG_NA); + assert(arrReg != REG_NA); + + // Evaluate tgtReg = offsetReg*dim_size + indexReg. + // tmpReg is used to load dim_size and the result of the multiplication. + // Note that dim_size will never be negative. 
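+        // As an illustration only (register names are placeholders), the sequence emitted below is
+        // roughly:
+        //     mov  tmpReg, [arrReg + dimSizeOffset]
+        //     imul tmpReg, offsetReg
+        //     mov  tgtReg, indexReg        ; only when indexReg != tgtReg
+        //     add  tgtReg, tmpReg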
+ + GetEmitter()->emitIns_R_AR(INS_mov, emitActualTypeSize(TYP_INT), tmpReg, arrReg, + genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); + inst_RV_RV(INS_imul, tmpReg, offsetReg); + + if (tmpReg == tgtReg) + { + inst_RV_RV(INS_add, tmpReg, indexReg); + } + else + { + if (indexReg != tgtReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL); + } + inst_RV_RV(INS_add, tgtReg, tmpReg); + } + } + else + { + if (indexReg != tgtReg) + { + inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT); + } + } + genProduceReg(arrOffset); +} + +instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type) +{ + instruction ins; + + // Operations on SIMD vectors shouldn't come this path + assert(!varTypeIsSIMD(type)); + if (varTypeIsFloating(type)) + { + return ins_MathOp(oper, type); + } + + switch (oper) + { + case GT_ADD: + ins = INS_add; + break; + case GT_AND: + ins = INS_and; + break; + case GT_LSH: + ins = INS_shl; + break; + case GT_MUL: + ins = INS_imul; + break; + case GT_NEG: + ins = INS_neg; + break; + case GT_NOT: + ins = INS_not; + break; + case GT_OR: + ins = INS_or; + break; + case GT_ROL: + ins = INS_rol; + break; + case GT_ROR: + ins = INS_ror; + break; + case GT_RSH: + ins = INS_sar; + break; + case GT_RSZ: + ins = INS_shr; + break; + case GT_SUB: + ins = INS_sub; + break; + case GT_XOR: + ins = INS_xor; + break; +#if !defined(TARGET_64BIT) + case GT_ADD_LO: + ins = INS_add; + break; + case GT_ADD_HI: + ins = INS_adc; + break; + case GT_SUB_LO: + ins = INS_sub; + break; + case GT_SUB_HI: + ins = INS_sbb; + break; + case GT_LSH_HI: + ins = INS_shld; + break; + case GT_RSH_LO: + ins = INS_shrd; + break; +#endif // !defined(TARGET_64BIT) + default: + unreached(); + break; + } + return ins; +} + +//------------------------------------------------------------------------ +// genCodeForShift: Generates the code sequence for a GenTree node that +// represents a bit shift or rotate operation (<<, >>, >>>, rol, ror). +// +// Arguments: +// tree - the bit shift node (that specifies the type of bit shift to perform). +// +// Assumptions: +// a) All GenTrees are register allocated. +// b) The shift-by-amount in tree->AsOp()->gtOp2 is either a contained constant or +// it's a register-allocated expression. If it is in a register that is +// not RCX, it will be moved to RCX (so RCX better not be in use!). +// +void CodeGen::genCodeForShift(GenTree* tree) +{ + // Only the non-RMW case here. 
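+    // As an illustration only: 'x << 3' with x and the result both in RAX becomes 'shl rax, 3',
+    // while a variable shift amount is constrained to RCX and becomes 'shl rax, cl'
+    // (register choices are just an example).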
+    assert(tree->OperIsShiftOrRotate());
+    assert(tree->AsOp()->gtOp1->isUsedFromReg());
+    assert(tree->GetRegNum() != REG_NA);
+
+    genConsumeOperands(tree->AsOp());
+
+    var_types targetType = tree->TypeGet();
+    instruction ins = genGetInsForOper(tree->OperGet(), targetType);
+
+    GenTree* operand = tree->gtGetOp1();
+    regNumber operandReg = operand->GetRegNum();
+
+    GenTree* shiftBy = tree->gtGetOp2();
+
+    if (shiftBy->isContainedIntOrIImmed())
+    {
+        emitAttr size = emitTypeSize(tree);
+
+        // Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg"
+        if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(1))
+        {
+            if (tree->GetRegNum() == operandReg)
+            {
+                GetEmitter()->emitIns_R_R(INS_add, size, tree->GetRegNum(), operandReg);
+            }
+            else
+            {
+                GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0);
+            }
+        }
+        else
+        {
+            int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();
+
+#if defined(TARGET_64BIT)
+            // Try to emit rorx if BMI2 is available instead of mov+rol
+            // it makes sense only for 64bit integers
+            if ((genActualType(targetType) == TYP_LONG) && (tree->GetRegNum() != operandReg) &&
+                compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && tree->OperIs(GT_ROL, GT_ROR) &&
+                (shiftByValue > 0) && (shiftByValue < 64))
+            {
+                const int value = tree->OperIs(GT_ROL) ? (64 - shiftByValue) : shiftByValue;
+                GetEmitter()->emitIns_R_R_I(INS_rorx, size, tree->GetRegNum(), operandReg, value);
+                genProduceReg(tree);
+                return;
+            }
+#endif
+            // First, move the operand to the destination register and
+            // later on perform the shift in-place.
+            // (LSRA will try to avoid this situation through preferencing.)
+            if (tree->GetRegNum() != operandReg)
+            {
+                inst_RV_RV(INS_mov, tree->GetRegNum(), operandReg, targetType);
+            }
+            inst_RV_SH(ins, size, tree->GetRegNum(), shiftByValue);
+        }
+    }
+    else
+    {
+        // We must have the number of bits to shift stored in ECX, since we constrained this node to
+        // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single
+        // register destination requirement.
+        genCopyRegIfNeeded(shiftBy, REG_RCX);
+
+        // The operand to be shifted must not be in ECX
+        noway_assert(operandReg != REG_RCX);
+
+        if (tree->GetRegNum() != operandReg)
+        {
+            inst_RV_RV(INS_mov, tree->GetRegNum(), operandReg, targetType);
+        }
+        inst_RV_CL(ins, tree->GetRegNum(), targetType);
+    }
+
+    genProduceReg(tree);
+}
+
+#ifdef TARGET_X86
+//------------------------------------------------------------------------
+// genCodeForShiftLong: Generates the code sequence for a GenTree node that
+// represents a three operand bit shift or rotate operation (<<Hi, >>Lo).
+//
+// Arguments:
+//    tree - the bit shift node (that specifies the type of bit shift to perform).
+//
+// Assumptions:
+//    a) All GenTrees are register allocated.
+//    b) The shift-by-amount in tree->AsOp()->gtOp2 is a contained constant
+//
+// TODO-X86-CQ: This only handles the case where the operand being shifted is in a register. We don't
+// need sourceHi to be always in reg in case of GT_LSH_HI (because it could be moved from memory to
+// targetReg if sourceHi is a memory operand). Similarly for GT_RSH_LO, sourceLo could be marked as
+// contained memory-op. Even if not a memory-op, we could mark it as reg-optional.
+//
+void CodeGen::genCodeForShiftLong(GenTree* tree)
+{
+    // Only the non-RMW case here.
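+    // As an illustration only: shifting a 32-bit pair (hi:lo) left by 5 with the result register
+    // holding hi becomes 'shld result, lo, 5'; the GT_RSH_LO case uses 'shrd' analogously.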
+ genTreeOps oper = tree->OperGet(); + assert(oper == GT_LSH_HI || oper == GT_RSH_LO); + + GenTree* operand = tree->AsOp()->gtOp1; + assert(operand->OperGet() == GT_LONG); + assert(operand->AsOp()->gtOp1->isUsedFromReg()); + assert(operand->AsOp()->gtOp2->isUsedFromReg()); + + GenTree* operandLo = operand->gtGetOp1(); + GenTree* operandHi = operand->gtGetOp2(); + + regNumber regLo = operandLo->GetRegNum(); + regNumber regHi = operandHi->GetRegNum(); + + genConsumeOperands(tree->AsOp()); + + var_types targetType = tree->TypeGet(); + instruction ins = genGetInsForOper(oper, targetType); + + GenTree* shiftBy = tree->gtGetOp2(); + + assert(shiftBy->isContainedIntOrIImmed()); + + unsigned int count = (unsigned int)shiftBy->AsIntConCommon()->IconValue(); + + regNumber regResult = (oper == GT_LSH_HI) ? regHi : regLo; + + if (regResult != tree->GetRegNum()) + { + inst_RV_RV(INS_mov, tree->GetRegNum(), regResult, targetType); + } + + if (oper == GT_LSH_HI) + { + inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->GetRegNum(), regLo, count); + } + else + { + assert(oper == GT_RSH_LO); + inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->GetRegNum(), regHi, count); + } + + genProduceReg(tree); +} +#endif + +//------------------------------------------------------------------------ +// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that +// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example: +// GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) ) +// +// Arguments: +// storeIndNode: the GT_STOREIND node. +// +//void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd) +//{ +// assert(false); +//} + +//------------------------------------------------------------------------ +// genCodeForLclFld: Produce code for a GT_LCL_FLD node. +// +// Arguments: +// tree - the GT_LCL_FLD node +// +void CodeGen::genCodeForLclFld(GenTreeLclFld* tree) +{ + assert(tree->OperIs(GT_LCL_FLD)); + + var_types targetType = tree->TypeGet(); + regNumber targetReg = tree->GetRegNum(); + + noway_assert(targetReg != REG_NA); + +#ifdef FEATURE_SIMD + // Loading of TYP_SIMD12 (i.e. Vector3) field + if (targetType == TYP_SIMD12) + { + genLoadLclTypeSIMD12(tree); + return; + } +#endif + + noway_assert(targetType != TYP_STRUCT); + + emitAttr size = emitTypeSize(targetType); + unsigned offs = tree->GetLclOffs(); + unsigned varNum = tree->GetLclNum(); + assert(varNum < compiler->lvaCount); + + GetEmitter()->emitIns_R_S(ins_Load(targetType), size, targetReg, varNum, offs); + + genProduceReg(tree); +} + +//------------------------------------------------------------------------ +// genCodeForLclVar: Produce code for a GT_LCL_VAR node. +// +// Arguments: +// tree - the GT_LCL_VAR node +// +void CodeGen::genCodeForLclVar(GenTreeLclVar* tree) +{ + assert(tree->OperIs(GT_LCL_VAR)); + + // lcl_vars are not defs + assert((tree->gtFlags & GTF_VAR_DEF) == 0); + + LclVarDsc* varDsc = compiler->lvaGetDesc(tree); + bool isRegCandidate = varDsc->lvIsRegCandidate(); + + // If this is a register candidate that has been spilled, genConsumeReg() will + // reload it at the point of use. Otherwise, if it's not in a register, we load it here. + + if (!isRegCandidate && !tree->IsMultiReg() && !(tree->gtFlags & GTF_SPILLED)) + { +#if defined(FEATURE_SIMD) && defined(TARGET_X86) + // Loading of TYP_SIMD12 (i.e. 
Vector3) variable + if (tree->TypeGet() == TYP_SIMD12) + { + genLoadLclTypeSIMD12(tree); + return; + } +#endif // defined(FEATURE_SIMD) && defined(TARGET_X86) + + var_types type = varDsc->GetRegisterType(tree); + GetEmitter()->emitIns_R_S(ins_Load(type, compiler->isSIMDTypeLocalAligned(tree->GetLclNum())), + emitTypeSize(type), tree->GetRegNum(), tree->GetLclNum(), 0); + genProduceReg(tree); + } +} + +//------------------------------------------------------------------------ +// genCodeForStoreLclFld: Produce code for a GT_STORE_LCL_FLD node. +// +// Arguments: +// tree - the GT_STORE_LCL_FLD node +// +void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree) +{ + assert(tree->OperIs(GT_STORE_LCL_FLD)); + + var_types targetType = tree->TypeGet(); + GenTree* op1 = tree->gtGetOp1(); + + noway_assert(targetType != TYP_STRUCT); + +#ifdef FEATURE_SIMD + // storing of TYP_SIMD12 (i.e. Vector3) field + if (tree->TypeGet() == TYP_SIMD12) + { + genStoreLclTypeSIMD12(tree); + return; + } +#endif // FEATURE_SIMD + + assert(varTypeUsesFloatReg(targetType) == varTypeUsesFloatReg(op1)); + assert(genTypeSize(genActualType(targetType)) == genTypeSize(genActualType(op1->TypeGet()))); + + genConsumeRegs(op1); + GetEmitter()->emitInsBinary(ins_Store(targetType), emitTypeSize(tree), tree, op1); + + // Updating variable liveness after instruction was emitted + genUpdateLife(tree); +} + +//------------------------------------------------------------------------ +// genCodeForStoreLclVar: Produce code for a GT_STORE_LCL_VAR node. +// +// Arguments: +// lclNode - the GT_STORE_LCL_VAR node +// +void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode) +{ + assert(lclNode->OperIs(GT_STORE_LCL_VAR)); + + regNumber targetReg = lclNode->GetRegNum(); + emitter* emit = GetEmitter(); + + GenTree* op1 = lclNode->gtGetOp1(); + + // Stores from a multi-reg source are handled separately. + if (op1->gtSkipReloadOrCopy()->IsMultiRegNode()) + { + genMultiRegStoreToLocal(lclNode); + } + else + { + unsigned lclNum = lclNode->GetLclNum(); + LclVarDsc* varDsc = compiler->lvaGetDesc(lclNum); + + var_types targetType = varDsc->GetRegisterType(lclNode); + +#ifdef DEBUG + var_types op1Type = op1->TypeGet(); + if (op1Type == TYP_STRUCT) + { + assert(op1->IsLocal()); + GenTreeLclVar* op1LclVar = op1->AsLclVar(); + unsigned op1lclNum = op1LclVar->GetLclNum(); + LclVarDsc* op1VarDsc = compiler->lvaGetDesc(op1lclNum); + op1Type = op1VarDsc->GetRegisterType(op1LclVar); + } + assert(varTypeUsesFloatReg(targetType) == varTypeUsesFloatReg(op1Type)); + assert(!varTypeUsesFloatReg(targetType) || (emitTypeSize(targetType) == emitTypeSize(op1Type))); +#endif + +#if !defined(TARGET_64BIT) + if (targetType == TYP_LONG) + { + genStoreLongLclVar(lclNode); + return; + } +#endif // !defined(TARGET_64BIT) + +#ifdef FEATURE_SIMD + // storing of TYP_SIMD12 (i.e. 
Vector3) field + if (targetType == TYP_SIMD12) + { + genStoreLclTypeSIMD12(lclNode); + return; + } +#endif // FEATURE_SIMD + + genConsumeRegs(op1); + + if (op1->OperIs(GT_BITCAST) && op1->isContained()) + { + GenTree* bitCastSrc = op1->gtGetOp1(); + var_types srcType = bitCastSrc->TypeGet(); + noway_assert(!bitCastSrc->isContained()); + if (targetReg == REG_NA) + { + emit->emitIns_S_R(ins_Store(srcType, compiler->isSIMDTypeLocalAligned(lclNum)), + emitTypeSize(targetType), bitCastSrc->GetRegNum(), lclNum, 0); + genUpdateLife(lclNode); + varDsc->SetRegNum(REG_STK); + } + else + { + genBitCast(targetType, targetReg, srcType, bitCastSrc->GetRegNum()); + } + } + else if (targetReg == REG_NA) + { + // stack store + emit->emitInsStoreLcl(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), + emitTypeSize(targetType), lclNode); + varDsc->SetRegNum(REG_STK); + } + else + { + // Look for the case where we have a constant zero which we've marked for reuse, + // but which isn't actually in the register we want. In that case, it's better to create + // zero in the target register, because an xor is smaller than a copy. Note that we could + // potentially handle this in the register allocator, but we can't always catch it there + // because the target may not have a register allocated for it yet. + if (op1->isUsedFromReg() && (op1->GetRegNum() != targetReg) && (op1->IsIntegralConst(0) || op1->IsFPZero())) + { + op1->SetRegNum(REG_NA); + op1->ResetReuseRegVal(); + op1->SetContained(); + } + + if (!op1->isUsedFromReg()) + { + // Currently, we assume that the non-reg source of a GT_STORE_LCL_VAR writing to a register + // must be a constant. However, in the future we might want to support an operand used from + // memory. This is a bit tricky because we have to decide it can be used from memory before + // register allocation, + // and this would be a case where, once that's done, we need to mark that node as always + // requiring a register - which we always assume now anyway, but once we "optimize" that + // we'll have to take cases like this into account. + assert((op1->GetRegNum() == REG_NA) && op1->OperIsConst()); + genSetRegToConst(targetReg, targetType, op1); + } + else if (op1->GetRegNum() != targetReg) + { + assert(op1->GetRegNum() != REG_NA); + emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(lclNode), lclNode, op1); + } + } + if (targetReg != REG_NA) + { + genProduceReg(lclNode); + } + } +} + +//------------------------------------------------------------------------ +// genCodeForIndexAddr: Produce code for a GT_INDEX_ADDR node. +// +// Arguments: +// tree - the GT_INDEX_ADDR node +// +void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// genCodeForIndir: Produce code for a GT_IND node. +// +// Arguments: +// tree - the GT_IND node +// +void CodeGen::genCodeForIndir(GenTreeIndir* tree) +{ + assert(tree->OperIs(GT_IND)); + +#ifdef FEATURE_SIMD + // Handling of Vector3 type values loaded through indirection. 
+ if (tree->TypeGet() == TYP_SIMD12) + { + genLoadIndTypeSIMD12(tree); + return; + } +#endif // FEATURE_SIMD + + var_types targetType = tree->TypeGet(); + emitter* emit = GetEmitter(); + + GenTree* addr = tree->Addr(); + if (addr->IsCnsIntOrI() && addr->IsIconHandle(GTF_ICON_TLS_HDL)) + { + noway_assert(EA_ATTR(genTypeSize(targetType)) == EA_PTRSIZE); + emit->emitIns_R_C(ins_Load(TYP_I_IMPL), EA_PTRSIZE, tree->GetRegNum(), FLD_GLOBAL_FS, + (int)addr->AsIntCon()->gtIconVal); + } + else + { + genConsumeAddress(addr); + emit->emitInsLoadInd(ins_Load(targetType), emitTypeSize(tree), tree->GetRegNum(), tree); + } + + genProduceReg(tree); +} + +//------------------------------------------------------------------------ +// genCodeForStoreInd: Produce code for a GT_STOREIND node. +// +// Arguments: +// tree - the GT_STOREIND node +// +void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// genCodeForSwap: Produce code for a GT_SWAP node. +// +// Arguments: +// tree - the GT_SWAP node +// +void CodeGen::genCodeForSwap(GenTreeOp* tree) +{ + assert(tree->OperIs(GT_SWAP)); + + // Swap is only supported for lclVar operands that are enregistered + // We do not consume or produce any registers. Both operands remain enregistered. + // However, the gc-ness may change. + assert(genIsRegCandidateLocal(tree->gtOp1) && genIsRegCandidateLocal(tree->gtOp2)); + + GenTreeLclVarCommon* lcl1 = tree->gtOp1->AsLclVarCommon(); + LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->GetLclNum()]); + var_types type1 = varDsc1->TypeGet(); + GenTreeLclVarCommon* lcl2 = tree->gtOp2->AsLclVarCommon(); + LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->GetLclNum()]); + var_types type2 = varDsc2->TypeGet(); + + // We must have both int or both fp regs + assert(!varTypeUsesFloatReg(type1) || varTypeUsesFloatReg(type2)); + + // FP swap is not yet implemented (and should have NYI'd in LSRA) + assert(!varTypeUsesFloatReg(type1)); + + regNumber oldOp1Reg = lcl1->GetRegNum(); + regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg); + regNumber oldOp2Reg = lcl2->GetRegNum(); + regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg); + + // We don't call genUpdateVarReg because we don't have a tree node with the new register. + varDsc1->SetRegNum(oldOp2Reg); + varDsc2->SetRegNum(oldOp1Reg); + + // Do the xchg + emitAttr size = EA_PTRSIZE; + if (varTypeGCtype(type1) != varTypeGCtype(type2)) + { + // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers. + // Otherwise it will leave them alone, which is correct if they have the same GC-ness. + size = EA_GCREF; + } + inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size); + + // Update the gcInfo. + // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output) + gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask); + gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask); + + // gcMarkRegPtrVal will do the appropriate thing for non-gc types. + // It will also dump the updates. + gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1); + gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2); +} + +//------------------------------------------------------------------------ +// genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized +// helper functions. 
+// +// Arguments: +// writeBarrierForm - the write barrier form to use +// addr - the address at which to do the store +// data - the data to store +// +// Return Value: +// true if an optimized write barrier form was used, false if not. If this +// function returns false, the caller must emit a "standard" write barrier. + +bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarrierForm, GenTree* addr, GenTree* data) +{ + assert(writeBarrierForm != GCInfo::WBF_NoBarrier); + +#if defined(TARGET_X86) && NOGC_WRITE_BARRIERS + if (!genUseOptimizedWriteBarriers(writeBarrierForm)) + { + return false; + } + + const static int regToHelper[2][8] = { + // If the target is known to be in managed memory + { + CORINFO_HELP_ASSIGN_REF_EAX, // EAX + CORINFO_HELP_ASSIGN_REF_ECX, // ECX + -1, // EDX (always the target address) + CORINFO_HELP_ASSIGN_REF_EBX, // EBX + -1, // ESP + CORINFO_HELP_ASSIGN_REF_EBP, // EBP + CORINFO_HELP_ASSIGN_REF_ESI, // ESI + CORINFO_HELP_ASSIGN_REF_EDI, // EDI + }, + + // Don't know if the target is in managed memory + { + CORINFO_HELP_CHECKED_ASSIGN_REF_EAX, // EAX + CORINFO_HELP_CHECKED_ASSIGN_REF_ECX, // ECX + -1, // EDX (always the target address) + CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, // EBX + -1, // ESP + CORINFO_HELP_CHECKED_ASSIGN_REF_EBP, // EBP + CORINFO_HELP_CHECKED_ASSIGN_REF_ESI, // ESI + CORINFO_HELP_CHECKED_ASSIGN_REF_EDI, // EDI + }, + }; + + noway_assert(regToHelper[0][REG_EAX] == CORINFO_HELP_ASSIGN_REF_EAX); + noway_assert(regToHelper[0][REG_ECX] == CORINFO_HELP_ASSIGN_REF_ECX); + noway_assert(regToHelper[0][REG_EBX] == CORINFO_HELP_ASSIGN_REF_EBX); + noway_assert(regToHelper[0][REG_ESP] == -1); + noway_assert(regToHelper[0][REG_EBP] == CORINFO_HELP_ASSIGN_REF_EBP); + noway_assert(regToHelper[0][REG_ESI] == CORINFO_HELP_ASSIGN_REF_ESI); + noway_assert(regToHelper[0][REG_EDI] == CORINFO_HELP_ASSIGN_REF_EDI); + + noway_assert(regToHelper[1][REG_EAX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EAX); + noway_assert(regToHelper[1][REG_ECX] == CORINFO_HELP_CHECKED_ASSIGN_REF_ECX); + noway_assert(regToHelper[1][REG_EBX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBX); + noway_assert(regToHelper[1][REG_ESP] == -1); + noway_assert(regToHelper[1][REG_EBP] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBP); + noway_assert(regToHelper[1][REG_ESI] == CORINFO_HELP_CHECKED_ASSIGN_REF_ESI); + noway_assert(regToHelper[1][REG_EDI] == CORINFO_HELP_CHECKED_ASSIGN_REF_EDI); + + regNumber reg = data->GetRegNum(); + noway_assert((reg != REG_ESP) && (reg != REG_WRITE_BARRIER)); + + // Generate the following code: + // lea edx, addr + // call write_barrier_helper_reg + + // addr goes in REG_ARG_0 + genCopyRegIfNeeded(addr, REG_WRITE_BARRIER); + + unsigned tgtAnywhere = 0; + if (writeBarrierForm != GCInfo::WBF_BarrierUnchecked) + { + tgtAnywhere = 1; + } + + // We might want to call a modified version of genGCWriteBarrier() to get the benefit of + // the FEATURE_COUNT_GC_WRITE_BARRIERS code there, but that code doesn't look like it works + // with rationalized RyuJIT IR. So, for now, just emit the helper call directly here. + + genEmitHelperCall(regToHelper[tgtAnywhere][reg], + 0, // argSize + EA_PTRSIZE); // retSize + + return true; +#else // !defined(TARGET_X86) || !NOGC_WRITE_BARRIERS + return false; +#endif // !defined(TARGET_X86) || !NOGC_WRITE_BARRIERS +} + +// Produce code for a GT_CALL node +void CodeGen::genCallInstruction(GenTreeCall* call) +{ + assert(false); + +} + +// Produce code for a GT_JMP node. 
+// The arguments of the caller needs to be transferred to the callee before exiting caller. +// The actual jump to callee is generated as part of caller epilog sequence. +// Therefore the codegen of GT_JMP is to ensure that the callee arguments are correctly setup. +void CodeGen::genJmpMethod(GenTree* jmp) +{ + assert(jmp->OperGet() == GT_JMP); + assert(compiler->compJmpOpUsed); + + // If no arguments, nothing to do + if (compiler->info.compArgsCount == 0) + { + return; + } + + // Make sure register arguments are in their initial registers + // and stack arguments are put back as well. + unsigned varNum; + LclVarDsc* varDsc; + + // First move any en-registered stack arguments back to the stack. + // At the same time any reg arg not in correct reg is moved back to its stack location. + // + // We are not strictly required to spill reg args that are not in the desired reg for a jmp call + // But that would require us to deal with circularity while moving values around. Spilling + // to stack makes the implementation simple, which is not a bad trade off given Jmp calls + // are not frequent. + for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) + { + varDsc = compiler->lvaTable + varNum; + + if (varDsc->lvPromoted) + { + noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here + + unsigned fieldVarNum = varDsc->lvFieldLclStart; + varDsc = compiler->lvaTable + fieldVarNum; + } + noway_assert(varDsc->lvIsParam); + + if (varDsc->lvIsRegArg && (varDsc->GetRegNum() != REG_STK)) + { + // Skip reg args which are already in its right register for jmp call. + // If not, we will spill such args to their stack locations. + // + // If we need to generate a tail call profiler hook, then spill all + // arg regs to free them up for the callback. + if (!compiler->compIsProfilerHookNeeded() && (varDsc->GetRegNum() == varDsc->GetArgReg())) + { + continue; + } + } + else if (varDsc->GetRegNum() == REG_STK) + { + // Skip args which are currently living in stack. + continue; + } + + // If we came here it means either a reg argument not in the right register or + // a stack argument currently living in a register. In either case the following + // assert should hold. + assert(varDsc->GetRegNum() != REG_STK); + + assert(!varDsc->lvIsStructField || (compiler->lvaTable[varDsc->lvParentLcl].lvFieldCnt == 1)); + var_types storeType = genActualType(varDsc->lvaArgType()); // We own the memory and can use the full move. + GetEmitter()->emitIns_S_R(ins_Store(storeType), emitTypeSize(storeType), varDsc->GetRegNum(), varNum, 0); + + // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live. + // Note that we cannot modify varDsc->GetRegNum() here because another basic block may not be expecting it. + // Therefore manually update life of varDsc->GetRegNum(). + regMaskTP tempMask = varDsc->lvRegMask(); + regSet.RemoveMaskVars(tempMask); + gcInfo.gcMarkRegSetNpt(tempMask); + if (compiler->lvaIsGCTracked(varDsc)) + { +#ifdef DEBUG + if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum); + } + else + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum); + } +#endif // DEBUG + + VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + +#ifdef PROFILING_SUPPORTED + // At this point all arg regs are free. + // Emit tail call profiler callback. 
+ genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL); +#endif + + // Next move any un-enregistered register arguments back to their register. + regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method. + unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method. + for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) + { + varDsc = compiler->lvaTable + varNum; + if (varDsc->lvPromoted) + { + noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here + + unsigned fieldVarNum = varDsc->lvFieldLclStart; + varDsc = compiler->lvaTable + fieldVarNum; + } + noway_assert(varDsc->lvIsParam); + + // Skip if arg not passed in a register. + if (!varDsc->lvIsRegArg) + { + continue; + } + +#if defined(UNIX_AMD64_ABI) + if (varTypeIsStruct(varDsc)) + { + CORINFO_CLASS_HANDLE typeHnd = varDsc->GetStructHnd(); + assert(typeHnd != nullptr); + + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); + assert(structDesc.passedInRegisters); + + unsigned __int8 offset0 = 0; + unsigned __int8 offset1 = 0; + var_types type0 = TYP_UNKNOWN; + var_types type1 = TYP_UNKNOWN; + + // Get the eightbyte data + compiler->GetStructTypeOffset(structDesc, &type0, &type1, &offset0, &offset1); + + // Move the values into the right registers. + // + + // Update varDsc->GetArgReg() and lvOtherArgReg life and GC Info to indicate varDsc stack slot is dead and + // argReg is going live. Note that we cannot modify varDsc->GetRegNum() and lvOtherArgReg here + // because another basic block may not be expecting it. + // Therefore manually update life of argReg. Note that GT_JMP marks + // the end of the basic block and after which reg life and gc info will be recomputed for the new block in + // genCodeForBBList(). + if (type0 != TYP_UNKNOWN) + { + GetEmitter()->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), varDsc->GetArgReg(), varNum, offset0); + regSet.SetMaskVars(regSet.GetMaskVars() | genRegMask(varDsc->GetArgReg())); + gcInfo.gcMarkRegPtrVal(varDsc->GetArgReg(), type0); + } + + if (type1 != TYP_UNKNOWN) + { + GetEmitter()->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), varDsc->GetOtherArgReg(), varNum, + offset1); + regSet.SetMaskVars(regSet.GetMaskVars() | genRegMask(varDsc->GetOtherArgReg())); + gcInfo.gcMarkRegPtrVal(varDsc->GetOtherArgReg(), type1); + } + + if (varDsc->lvTracked) + { + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + else +#endif // !defined(UNIX_AMD64_ABI) + { + // Register argument + CLANG_FORMAT_COMMENT_ANCHOR; +#ifdef TARGET_X86 + noway_assert( + isRegParamType(genActualType(varDsc->TypeGet())) || + (varTypeIsStruct(varDsc->TypeGet()) && compiler->isTrivialPointerSizedStruct(varDsc->GetStructHnd()))); +#else + noway_assert(isRegParamType(genActualType(varDsc->TypeGet()))); +#endif // TARGET_X86 + + // Is register argument already in the right register? + // If not load it from its stack location. + var_types loadType = varDsc->lvaArgType(); + +#ifdef TARGET_X86 + if (varTypeIsStruct(varDsc->TypeGet())) + { + // Treat trivial pointer-sized structs as a pointer sized primitive + // for the purposes of registers. 
+ loadType = TYP_I_IMPL; + } +#endif + + regNumber argReg = varDsc->GetArgReg(); // incoming arg register + + if (varDsc->GetRegNum() != argReg) + { + assert(genIsValidReg(argReg)); + GetEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); + + // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. + // Note that we cannot modify varDsc->GetRegNum() here because another basic block may not be + // expecting it. Therefore manually update life of argReg. Note that GT_JMP marks the end of the + // basic block and after which reg life and gc info will be recomputed for the new block in + // genCodeForBBList(). + regSet.AddMaskVars(genRegMask(argReg)); + gcInfo.gcMarkRegPtrVal(argReg, loadType); + if (compiler->lvaIsGCTracked(varDsc)) + { +#ifdef DEBUG + if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming dead\n", varNum); + } + else + { + JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing dead\n", varNum); + } +#endif // DEBUG + + VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); + } + } + } + +#if FEATURE_VARARG && defined(TARGET_AMD64) + // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg + // register. This is due to the AMD64 ABI which requires floating point values passed to varargs functions to + // be passed in both integer and floating point registers. It doesn't apply to x86, which passes floating point + // values on the stack. + if (compiler->info.compIsVarArgs) + { + regNumber intArgReg; + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->GetArgReg(); // incoming arg register + + if (varTypeIsFloating(loadType)) + { + intArgReg = compiler->getCallArgIntRegister(argReg); + instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); + inst_RV_RV(ins, argReg, intArgReg, loadType); + } + else + { + intArgReg = argReg; + } + + fixedIntArgMask |= genRegMask(intArgReg); + + if (intArgReg == REG_ARG_0) + { + assert(firstArgVarNum == BAD_VAR_NUM); + firstArgVarNum = varNum; + } + } +#endif // FEATURE_VARARG + } + +#if FEATURE_VARARG && defined(TARGET_AMD64) + // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments, + // load the remaining arg registers (both int and float) from the corresponding + // shadow stack slots. This is for the reason that we don't know the number and type + // of non-fixed params passed by the caller, therefore we have to assume the worst case + // of caller passing float/double args both in int and float arg regs. + // + // This doesn't apply to x86, which doesn't pass floating point values in floating + // point registers. + // + // The caller could have passed gc-ref/byref type var args. Since these are var args + // the callee no way of knowing their gc-ness. Therefore, mark the region that loads + // remaining arg registers from shadow stack slots as non-gc interruptible. 
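+    // (On Windows x64, for example, the fixed integer argument registers are RCX, RDX, R8 and R9, and the
+    // caller allocates a home ("shadow") slot for each of them; the loop below reloads any register that is
+    // not covered by a fixed argument from its home slot and mirrors the value into the matching XMM register.)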
+ if (fixedIntArgMask != RBM_NONE) + { + assert(compiler->info.compIsVarArgs); + assert(firstArgVarNum != BAD_VAR_NUM); + + regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask; + if (remainingIntArgMask != RBM_NONE) + { + instruction insCopyIntToFloat = ins_CopyIntToFloat(TYP_LONG, TYP_DOUBLE); + GetEmitter()->emitDisableGC(); + for (int argNum = 0, argOffset = 0; argNum < MAX_REG_ARG; ++argNum) + { + regNumber argReg = intArgRegs[argNum]; + regMaskTP argRegMask = genRegMask(argReg); + + if ((remainingIntArgMask & argRegMask) != 0) + { + remainingIntArgMask &= ~argRegMask; + GetEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset); + + // also load it in corresponding float arg reg + regNumber floatReg = compiler->getCallArgFloatRegister(argReg); + inst_RV_RV(insCopyIntToFloat, floatReg, argReg); + } + + argOffset += REGSIZE_BYTES; + } + GetEmitter()->emitEnableGC(); + } + } +#endif // FEATURE_VARARG +} + +// produce code for a GT_LEA subnode +void CodeGen::genLeaInstruction(GenTreeAddrMode* lea) +{ + emitAttr size = emitTypeSize(lea); + genConsumeOperands(lea); + + if (lea->Base() && lea->Index()) + { + regNumber baseReg = lea->Base()->GetRegNum(); + regNumber indexReg = lea->Index()->GetRegNum(); + GetEmitter()->emitIns_R_ARX(INS_lea, size, lea->GetRegNum(), baseReg, indexReg, lea->gtScale, lea->Offset()); + } + else if (lea->Base()) + { + GetEmitter()->emitIns_R_AR(INS_lea, size, lea->GetRegNum(), lea->Base()->GetRegNum(), lea->Offset()); + } + else if (lea->Index()) + { + GetEmitter()->emitIns_R_ARX(INS_lea, size, lea->GetRegNum(), REG_NA, lea->Index()->GetRegNum(), lea->gtScale, + lea->Offset()); + } + + genProduceReg(lea); +} + +//------------------------------------------------------------------------ +// genCompareFloat: Generate code for comparing two floating point values +// +// Arguments: +// treeNode - the compare tree +// +void CodeGen::genCompareFloat(GenTree* treeNode) +{ + assert(treeNode->OperIsCompare()); + + GenTreeOp* tree = treeNode->AsOp(); + GenTree* op1 = tree->gtOp1; + GenTree* op2 = tree->gtOp2; + var_types op1Type = op1->TypeGet(); + var_types op2Type = op2->TypeGet(); + + genConsumeOperands(tree); + + assert(varTypeIsFloating(op1Type)); + assert(op1Type == op2Type); + + regNumber targetReg = treeNode->GetRegNum(); + instruction ins; + emitAttr cmpAttr; + + GenCondition condition = GenCondition::FromFloatRelop(treeNode); + + if (condition.PreferSwap()) + { + condition = GenCondition::Swap(condition); + std::swap(op1, op2); + } + + ins = ins_FloatCompare(op1Type); + cmpAttr = emitTypeSize(op1Type); + + GetEmitter()->emitInsBinary(ins, cmpAttr, op1, op2); + + // Are we evaluating this into a register? + if (targetReg != REG_NA) + { + if ((condition.GetCode() == GenCondition::FNEU) && (op1->GetRegNum() == op2->GetRegNum())) + { + // For floating point, `x != x` is a common way of + // checking for NaN. So, in the case where both + // operands are the same, we can optimize codegen + // to only do a single check. + + condition = GenCondition(GenCondition::P); + } + + inst_SETCC(condition, treeNode->TypeGet(), targetReg); + genProduceReg(tree); + } +} + +//------------------------------------------------------------------------ +// genCompareInt: Generate code for comparing ints or, on amd64, longs. +// +// Arguments: +// treeNode - the compare tree +// +// Return Value: +// None. 
+void CodeGen::genCompareInt(GenTree* treeNode)
+{
+    assert(treeNode->OperIsCompare() || treeNode->OperIs(GT_CMP));
+
+    GenTreeOp* tree = treeNode->AsOp();
+    GenTree* op1 = tree->gtOp1;
+    GenTree* op2 = tree->gtOp2;
+    var_types op1Type = op1->TypeGet();
+    var_types op2Type = op2->TypeGet();
+    regNumber targetReg = tree->GetRegNum();
+    emitter* emit = GetEmitter();
+    bool canReuseFlags = false;
+
+    genConsumeOperands(tree);
+
+    assert(!op1->isContainedIntOrIImmed());
+    assert(!varTypeIsFloating(op2Type));
+
+    instruction ins;
+    var_types type = TYP_UNKNOWN;
+
+    if (tree->OperIs(GT_TEST_EQ, GT_TEST_NE))
+    {
+        ins = INS_test;
+
+        // Unlike many xarch instructions TEST doesn't have a form with a 16/32/64 bit first operand and
+        // an 8 bit immediate second operand. But if the immediate value fits in 8 bits then we can simply
+        // emit an 8 bit TEST instruction, unless we're targeting x86 and the first operand is a non-byteable
+        // register.
+        // Note that lowering does something similar but its main purpose is to allow memory operands to be
+        // contained so it doesn't handle other kinds of operands. It could do more but on x86 that results
+        // in additional register constraints and that may be worse than wasting 3 bytes on an immediate.
+        if (
+#ifdef TARGET_X86
+            (!op1->isUsedFromReg() || isByteReg(op1->GetRegNum())) &&
+#endif
+            (op2->IsCnsIntOrI() && genSmallTypeCanRepresentValue(TYP_UBYTE, op2->AsIntCon()->IconValue())))
+        {
+            type = TYP_UBYTE;
+        }
+    }
+    else if (op1->isUsedFromReg() && op2->IsIntegralConst(0))
+    {
+        if (compiler->opts.OptimizationEnabled())
+        {
+            emitAttr op1Size = emitActualTypeSize(op1->TypeGet());
+            assert((int)op1Size >= 4);
+
+            // Optimize "x<0" and "x>=0" to "x>>31" if "x" is not a jump condition and in a reg.
+            // Morph/Lowering are responsible to rotate "0<x" to "x>0" so we won't handle it here.
+            if ((targetReg != REG_NA) && tree->OperIs(GT_LT, GT_GE) && !tree->IsUnsigned())
+            {
+                if (targetReg != op1->GetRegNum())
+                {
+                    inst_RV_RV(INS_mov, targetReg, op1->GetRegNum(), op1->TypeGet());
+                }
+                if (tree->OperIs(GT_GE))
+                {
+                    // emit "not" for "x>=0" case
+                    inst_RV(INS_not, targetReg, op1->TypeGet());
+                }
+                inst_RV_IV(INS_shr_N, targetReg, (int)op1Size * 8 - 1, op1Size);
+                genProduceReg(tree);
+                return;
+            }
+            canReuseFlags = true;
+        }
+
+        // We're comparing a register to 0 so we can generate "test reg1, reg1"
+        // instead of the longer "cmp reg1, 0"
+        ins = INS_test;
+        op2 = op1;
+    }
+    else
+    {
+        ins = INS_cmp;
+    }
+
+    if (type == TYP_UNKNOWN)
+    {
+        if (op1Type == op2Type)
+        {
+            type = op1Type;
+        }
+        else if (genTypeSize(op1Type) == genTypeSize(op2Type))
+        {
+            // If the types are different but have the same size then we'll use TYP_INT or TYP_LONG.
+            // This primarily deals with small type mixes (e.g. byte/ubyte) that need to be widened
+            // and compared as int. We should not get long type mixes here but handle that as well
+            // just in case.
+            type = genTypeSize(op1Type) == 8 ? TYP_LONG : TYP_INT;
+        }
+        else
+        {
+            // If the types are different simply use TYP_INT. This deals with small type/int type
+            // mixes (e.g. byte/short ubyte/int) that need to be widened and compared as int.
+            // Lowering is expected to handle any mixes that involve long types (e.g. int/long).
+ type = TYP_INT; + } + + // The common type cannot be smaller than any of the operand types, we're probably mixing int/long + assert(genTypeSize(type) >= max(genTypeSize(op1Type), genTypeSize(op2Type))); + // Small unsigned int types (TYP_BOOL can use anything) should use unsigned comparisons + assert(!(varTypeIsSmallInt(type) && varTypeIsUnsigned(type)) || ((tree->gtFlags & GTF_UNSIGNED) != 0)); + // If op1 is smaller then it cannot be in memory, we're probably missing a cast + assert((genTypeSize(op1Type) >= genTypeSize(type)) || !op1->isUsedFromMemory()); + // If op2 is smaller then it cannot be in memory, we're probably missing a cast + assert((genTypeSize(op2Type) >= genTypeSize(type)) || !op2->isUsedFromMemory()); + // If we ended up with a small type and op2 is a constant then make sure we don't lose constant bits + assert(!op2->IsCnsIntOrI() || !varTypeIsSmall(type) || + genSmallTypeCanRepresentValue(type, op2->AsIntCon()->IconValue())); + } + + // The type cannot be larger than the machine word size + assert(genTypeSize(type) <= genTypeSize(TYP_I_IMPL)); + // TYP_UINT and TYP_ULONG should not appear here, only small types can be unsigned + assert(!varTypeIsUnsigned(type) || varTypeIsSmall(type)); + + bool needsOCFlags = !tree->OperIs(GT_EQ, GT_NE); + if (canReuseFlags && emit->AreFlagsSetToZeroCmp(op1->GetRegNum(), emitTypeSize(type), needsOCFlags)) + { + JITDUMP("Not emitting compare due to flags being already set\n"); + } + else + { + emit->emitInsBinary(ins, emitTypeSize(type), op1, op2); + } + + // Are we evaluating this into a register? + if (targetReg != REG_NA) + { + inst_SETCC(GenCondition::FromIntegralRelop(tree), tree->TypeGet(), targetReg); + genProduceReg(tree); + } +} + +#if !defined(TARGET_64BIT) +//------------------------------------------------------------------------ +// genLongToIntCast: Generate code for long to int casts on x86. +// +// Arguments: +// cast - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// The cast node and its sources (via GT_LONG) must have been assigned registers. +// The destination cannot be a floating point type or a small integer type. +// +void CodeGen::genLongToIntCast(GenTree* cast) +{ + assert(cast->OperGet() == GT_CAST); + + GenTree* src = cast->gtGetOp1(); + noway_assert(src->OperGet() == GT_LONG); + + genConsumeRegs(src); + + var_types srcType = ((cast->gtFlags & GTF_UNSIGNED) != 0) ? 
TYP_ULONG : TYP_LONG; + var_types dstType = cast->CastToType(); + regNumber loSrcReg = src->gtGetOp1()->GetRegNum(); + regNumber hiSrcReg = src->gtGetOp2()->GetRegNum(); + regNumber dstReg = cast->GetRegNum(); + + assert((dstType == TYP_INT) || (dstType == TYP_UINT)); + assert(genIsValidIntReg(loSrcReg)); + assert(genIsValidIntReg(hiSrcReg)); + assert(genIsValidIntReg(dstReg)); + + if (cast->gtOverflow()) + { + // + // Generate an overflow check for [u]long to [u]int casts: + // + // long -> int - check if the upper 33 bits are all 0 or all 1 + // + // ulong -> int - check if the upper 33 bits are all 0 + // + // long -> uint - check if the upper 32 bits are all 0 + // ulong -> uint - check if the upper 32 bits are all 0 + // + + if ((srcType == TYP_LONG) && (dstType == TYP_INT)) + { + BasicBlock* allOne = genCreateTempLabel(); + BasicBlock* success = genCreateTempLabel(); + + inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE); + inst_JMP(EJ_js, allOne); + + inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE); + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); + inst_JMP(EJ_jmp, success); + + genDefineTempLabel(allOne); + inst_RV_IV(INS_cmp, hiSrcReg, -1, EA_4BYTE); + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); + + genDefineTempLabel(success); + } + else + { + if ((srcType == TYP_ULONG) && (dstType == TYP_INT)) + { + inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE); + genJumpToThrowHlpBlk(EJ_js, SCK_OVERFLOW); + } + + inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE); + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); + } + } + + if (dstReg != loSrcReg) + { + inst_RV_RV(INS_mov, dstReg, loSrcReg, TYP_INT, EA_4BYTE); + } + + genProduceReg(cast); +} +#endif + +//------------------------------------------------------------------------ +// genIntCastOverflowCheck: Generate overflow checking code for an integer cast. +// +// Arguments: +// cast - The GT_CAST node +// desc - The cast description +// reg - The register containing the value to check +// +void CodeGen::genIntCastOverflowCheck(GenTreeCast* cast, const GenIntCastDesc& desc, regNumber reg) +{ + switch (desc.CheckKind()) + { + case GenIntCastDesc::CHECK_POSITIVE: + GetEmitter()->emitIns_R_R(INS_test, EA_SIZE(desc.CheckSrcSize()), reg, reg); + genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); + break; + +#ifdef TARGET_64BIT + case GenIntCastDesc::CHECK_UINT_RANGE: + { + // We need to check if the value is not greater than 0xFFFFFFFF but this value + // cannot be encoded in an immediate operand. Use a right shift to test if the + // upper 32 bits are zero. This requires a temporary register. 
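+            // The emitted sequence is effectively:
+            //     mov  tmp, reg
+            //     shr  tmp, 32
+            //     jne  overflow_throw_block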
+ const regNumber tempReg = cast->GetSingleTempReg(); + assert(tempReg != reg); + GetEmitter()->emitIns_R_R(INS_mov, EA_8BYTE, tempReg, reg); + GetEmitter()->emitIns_R_I(INS_shr_N, EA_8BYTE, tempReg, 32); + genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); + } + break; + + case GenIntCastDesc::CHECK_POSITIVE_INT_RANGE: + GetEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX); + genJumpToThrowHlpBlk(EJ_ja, SCK_OVERFLOW); + break; + + case GenIntCastDesc::CHECK_INT_RANGE: + GetEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX); + genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW); + GetEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MIN); + genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); + break; +#endif + + default: + { + assert(desc.CheckKind() == GenIntCastDesc::CHECK_SMALL_INT_RANGE); + const int castMaxValue = desc.CheckSmallIntMax(); + const int castMinValue = desc.CheckSmallIntMin(); + + GetEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMaxValue); + genJumpToThrowHlpBlk((castMinValue == 0) ? EJ_ja : EJ_jg, SCK_OVERFLOW); + + if (castMinValue != 0) + { + GetEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMinValue); + genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); + } + } + break; + } +} + +//------------------------------------------------------------------------ +// genIntToIntCast: Generate code for an integer cast, with or without overflow check. +// +// Arguments: +// cast - The GT_CAST node +// +// Assumptions: +// The cast node is not a contained node and must have an assigned register. +// Neither the source nor target type can be a floating point type. +// On x86 casts to (U)BYTE require that the source be in a byte register. +// +// TODO-XArch-CQ: Allow castOp to be a contained node without an assigned register. 
+// +void CodeGen::genIntToIntCast(GenTreeCast* cast) +{ + genConsumeRegs(cast->gtGetOp1()); + + const regNumber srcReg = cast->gtGetOp1()->GetRegNum(); + const regNumber dstReg = cast->GetRegNum(); + emitter* emit = GetEmitter(); + + assert(genIsValidIntReg(srcReg)); + assert(genIsValidIntReg(dstReg)); + + GenIntCastDesc desc(cast); + + if (desc.CheckKind() != GenIntCastDesc::CHECK_NONE) + { + genIntCastOverflowCheck(cast, desc, srcReg); + } + + if ((desc.ExtendKind() != GenIntCastDesc::COPY) || (srcReg != dstReg)) + { + instruction ins; + unsigned insSize; + bool canSkip = false; + + switch (desc.ExtendKind()) + { + case GenIntCastDesc::ZERO_EXTEND_SMALL_INT: + ins = INS_movzx; + insSize = desc.ExtendSrcSize(); + break; + case GenIntCastDesc::SIGN_EXTEND_SMALL_INT: + ins = INS_movsx; + insSize = desc.ExtendSrcSize(); + break; +#ifdef TARGET_64BIT + case GenIntCastDesc::ZERO_EXTEND_INT: + // We can skip emitting this zero extending move if the previous instruction zero extended implicitly + if ((srcReg == dstReg) && compiler->opts.OptimizationEnabled()) + { + canSkip = emit->AreUpper32BitsZero(srcReg); + } + + ins = INS_mov; + insSize = 4; + break; + case GenIntCastDesc::SIGN_EXTEND_INT: + ins = INS_movsxd; + insSize = 4; + break; +#endif + default: + assert(desc.ExtendKind() == GenIntCastDesc::COPY); + assert(srcReg != dstReg); + ins = INS_mov; + insSize = desc.ExtendSrcSize(); + break; + } + + if (canSkip) + { + JITDUMP("\n -- suppressing emission as previous instruction already properly extends.\n"); + } + else + { + emit->emitIns_R_R(ins, EA_ATTR(insSize), dstReg, srcReg); + } + } + + genProduceReg(cast); +} + +//------------------------------------------------------------------------ +// genFloatToFloatCast: Generate code for a cast between float and double +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// The cast is between float and double or vice versa. +// +void CodeGen::genFloatToFloatCast(GenTree* treeNode) +{ + // float <--> double conversions are always non-overflow ones + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->GetRegNum(); + assert(genIsValidFloatReg(targetReg)); + + GenTree* op1 = treeNode->AsOp()->gtOp1; +#ifdef DEBUG + // If not contained, must be a valid float reg. + if (op1->isUsedFromReg()) + { + assert(genIsValidFloatReg(op1->GetRegNum())); + } +#endif + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); + + genConsumeOperands(treeNode->AsOp()); + if (srcType == dstType && (op1->isUsedFromReg() && (targetReg == op1->GetRegNum()))) + { + // source and destinations types are the same and also reside in the same register. + // we just need to consume and produce the reg in this case. + ; + } + else + { + instruction ins = ins_FloatConv(dstType, srcType); + GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); + } + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genIntToFloatCast: Generate code to cast an int/long to float/double +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. 
+// SrcType= int32/uint32/int64/uint64 and DstType=float/double. +// +void CodeGen::genIntToFloatCast(GenTree* treeNode) +{ + // int type --> float/double conversions are always non-overflow ones + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->GetRegNum(); + assert(genIsValidFloatReg(targetReg)); + + GenTree* op1 = treeNode->AsOp()->gtOp1; +#ifdef DEBUG + if (op1->isUsedFromReg()) + { + assert(genIsValidIntReg(op1->GetRegNum())); + } +#endif + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); + +#if !defined(TARGET_64BIT) + // We expect morph to replace long to float/double casts with helper calls + noway_assert(!varTypeIsLong(srcType)); +#endif // !defined(TARGET_64BIT) + + // Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we + // ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except + // for GT_LCL_VAR_ADDR and GT_LCL_FLD_ADDR that represent stack addresses and can be considered + // as TYP_I_IMPL. In all other cases where src operand is a gc-type and not known to be on stack, + // Front-end (see fgMorphCast()) ensures this by assigning gc-type local to a non gc-type + // temp and using temp as operand of cast operation. + if (srcType == TYP_BYREF) + { + noway_assert(op1->OperGet() == GT_LCL_VAR_ADDR || op1->OperGet() == GT_LCL_FLD_ADDR); + srcType = TYP_I_IMPL; + } + + // force the srcType to unsigned if GT_UNSIGNED flag is set + if (treeNode->gtFlags & GTF_UNSIGNED) + { + srcType = genUnsignedType(srcType); + } + + noway_assert(!varTypeIsGC(srcType)); + + // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long). + // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect + // either the front-end or lowering phase to have generated two levels of cast. + // The first one is for widening smaller int type to int32 and the second one is + // to the float/double. + emitAttr srcSize = EA_ATTR(genTypeSize(srcType)); + noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG)))); + + // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions + // here since they should have been lowered apropriately. + noway_assert(srcType != TYP_UINT); + noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); + + // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used + // which does a partial write to lower 4/8 bytes of xmm register keeping the other + // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop, + // the partial write could introduce a false dependency and could cause a stall + // if there are further uses of xmmReg. We have such a case occurring with a + // customer reported version of SpectralNorm benchmark, resulting in 2x perf + // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before + // cvtsi2ss/sd instruction. + + genConsumeOperands(treeNode->AsOp()); + GetEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->GetRegNum(), treeNode->GetRegNum()); + + // Note that here we need to specify srcType that will determine + // the size of source reg/mem operand and rex.w prefix. + instruction ins = ins_FloatConv(dstType, TYP_INT); + GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + + // Handle the case of srcType = TYP_ULONG. 
SSE2 conversion instruction + // will interpret ULONG value as LONG. Hence we need to adjust the + // result if sign-bit of srcType is set. + if (srcType == TYP_ULONG) + { + // The instruction sequence below is less accurate than what clang + // and gcc generate. However, we keep the current sequence for backward compatibility. + // If we change the instructions below, FloatingPointUtils::convertUInt64ToDobule + // should be also updated for consistent conversion result. + assert(dstType == TYP_DOUBLE); + assert(op1->isUsedFromReg()); + + // Set the flags without modifying op1. + // test op1Reg, op1Reg + inst_RV_RV(INS_test, op1->GetRegNum(), op1->GetRegNum(), srcType); + + // No need to adjust result if op1 >= 0 i.e. positive + // Jge label + BasicBlock* label = genCreateTempLabel(); + inst_JMP(EJ_jge, label); + + // Adjust the result + // result = result + 0x43f00000 00000000 + // addsd resultReg, 0x43f00000 00000000 + CORINFO_FIELD_HANDLE* cns = &u8ToDblBitmask; + if (*cns == nullptr) + { + double d; + static_assert_no_msg(sizeof(double) == sizeof(__int64)); + *((__int64*)&d) = 0x43f0000000000000LL; + + *cns = GetEmitter()->emitFltOrDblConst(d, EA_8BYTE); + } + GetEmitter()->emitIns_R_C(INS_addsd, EA_8BYTE, treeNode->GetRegNum(), *cns, 0); + + genDefineTempLabel(label); + } + + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genFloatToIntCast: Generate code to cast float/double to int/long +// +// Arguments: +// treeNode - The GT_CAST node +// +// Return Value: +// None. +// +// Assumptions: +// Cast is a non-overflow conversion. +// The treeNode must have an assigned register. +// SrcType=float/double and DstType= int32/uint32/int64/uint64 +// +// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64 +// +void CodeGen::genFloatToIntCast(GenTree* treeNode) +{ + // we don't expect to see overflow detecting float/double --> int type conversions here + // as they should have been converted into helper calls by front-end. + assert(treeNode->OperGet() == GT_CAST); + assert(!treeNode->gtOverflow()); + + regNumber targetReg = treeNode->GetRegNum(); + assert(genIsValidIntReg(targetReg)); + + GenTree* op1 = treeNode->AsOp()->gtOp1; +#ifdef DEBUG + if (op1->isUsedFromReg()) + { + assert(genIsValidFloatReg(op1->GetRegNum())); + } +#endif + + var_types dstType = treeNode->CastToType(); + var_types srcType = op1->TypeGet(); + assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType)); + + // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG). + // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the + // front-end or lowering phase to have generated two levels of cast. The first one is + // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to + // the required smaller int type. + emitAttr dstSize = EA_ATTR(genTypeSize(dstType)); + noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG)))); + + // We shouldn't be seeing uint64 here as it should have been converted + // into a helper call by either front-end or lowering phase. + noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); + + // If the dstType is TYP_UINT, we have 32-bits to encode the + // float number. Any of 33rd or above bits can be the sign bit. + // To achieve it we pretend as if we are converting it to a long. 
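+    // For example, (uint)4000000000.0 does not fit in a signed 32-bit conversion, but converting to a
+    // 64-bit integer and keeping the low 32 bits yields the expected unsigned 32-bit result.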
+ if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT)))) + { + dstType = TYP_LONG; + } + + // Note that we need to specify dstType here so that it will determine + // the size of destination integer register and also the rex.w prefix. + genConsumeOperands(treeNode->AsOp()); + instruction ins = ins_FloatConv(TYP_INT, srcType); + GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); + genProduceReg(treeNode); +} + +//------------------------------------------------------------------------ +// genCkfinite: Generate code for ckfinite opcode. +// +// Arguments: +// treeNode - The GT_CKFINITE node +// +// Return Value: +// None. +// +// Assumptions: +// GT_CKFINITE node has reserved an internal register. +// +// TODO-XArch-CQ - mark the operand as contained if known to be in +// memory (e.g. field or an array element). +// +void CodeGen::genCkfinite(GenTree* treeNode) +{ + assert(false); +} + +#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +int CodeGenInterface::genSPtoFPdelta() const +{ + int delta; + +#ifdef UNIX_AMD64_ABI + + // We require frame chaining on Unix to support native tool unwinding (such as + // unwinding by the native debugger). We have a CLR-only extension to the + // unwind codes (UWOP_SET_FPREG_LARGE) to support SP->FP offsets larger than 240. + // If Unix ever supports EnC, the RSP == RBP assumption will have to be reevaluated. + delta = genTotalFrameSize(); + +#else // !UNIX_AMD64_ABI + + // As per Amd64 ABI, RBP offset from initial RSP can be between 0 and 240 if + // RBP needs to be reported in unwind codes. This case would arise for methods + // with localloc. + if (compiler->compLocallocUsed) + { + // We cannot base delta computation on compLclFrameSize since it changes from + // tentative to final frame layout and hence there is a possibility of + // under-estimating offset of vars from FP, which in turn results in under- + // estimating instruction size. + // + // To be predictive and so as never to under-estimate offset of vars from FP + // we will always position FP at min(240, outgoing arg area size). + delta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize); + } + else if (compiler->opts.compDbgEnC) + { + // vm assumption on EnC methods is that rsp and rbp are equal + delta = 0; + } + else + { + delta = genTotalFrameSize(); + } + +#endif // !UNIX_AMD64_ABI + + return delta; +} + +//--------------------------------------------------------------------- +// genTotalFrameSize - return the total size of the stack frame, including local size, +// callee-saved register size, etc. For AMD64, this does not include the caller-pushed +// return address. +// +// Return value: +// Total frame size +// + +int CodeGenInterface::genTotalFrameSize() const +{ + assert(!IsUninitialized(compiler->compCalleeRegsPushed)); + + int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize; + + assert(totalFrameSize >= 0); + return totalFrameSize; +} + +//--------------------------------------------------------------------- +// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer. +// This number is going to be negative, since the Caller-SP is at a higher +// address than the frame pointer. +// +// There must be a frame pointer to call this function! +// +// We can't compute this directly from the Caller-SP, since the frame pointer +// is based on a maximum delta from Initial-SP, so first we find SP, then +// compute the FP offset. 
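+// In other words, Caller-SP-to-FP = Caller-SP-to-Initial-SP + SP-to-FP, which is how the
+// function below computes it.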
+ +int CodeGenInterface::genCallerSPtoFPdelta() const +{ + assert(isFramePointerUsed()); + int callerSPtoFPdelta; + + callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta(); + + assert(callerSPtoFPdelta <= 0); + return callerSPtoFPdelta; +} + +//--------------------------------------------------------------------- +// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP. +// +// This number will be negative. + +int CodeGenInterface::genCallerSPtoInitialSPdelta() const +{ + int callerSPtoSPdelta = 0; + + callerSPtoSPdelta -= genTotalFrameSize(); + callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address + + // compCalleeRegsPushed does not account for the frame pointer + // TODO-Cleanup: shouldn't this be part of genTotalFrameSize? + if (isFramePointerUsed()) + { + callerSPtoSPdelta -= REGSIZE_BYTES; + } + + assert(callerSPtoSPdelta <= 0); + return callerSPtoSPdelta; +} +#endif // TARGET_AMD64 + +//----------------------------------------------------------------------------------------- +// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask" +// +// Arguments: +// treeNode - tree node +// +// Return value: +// None +// +// Assumptions: +// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs() +// ii) tree type is floating point type. +// iii) caller of this routine needs to call genProduceReg() +void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) +{ + regNumber targetReg = treeNode->GetRegNum(); + regNumber operandReg = genConsumeReg(treeNode->gtGetOp1()); + emitAttr size = emitTypeSize(treeNode); + + assert(varTypeIsFloating(treeNode->TypeGet())); + assert(treeNode->gtGetOp1()->isUsedFromReg()); + + CORINFO_FIELD_HANDLE* maskFld = nullptr; + UINT64 mask = 0; + instruction ins = INS_invalid; + + if (treeNode->OperIs(GT_NEG)) + { + // Neg(x) = flip the sign bit. + // Neg(f) = f ^ 0x80000000 x4 (packed) + // Neg(d) = d ^ 0x8000000000000000 x2 (packed) + ins = INS_xorps; + mask = treeNode->TypeIs(TYP_FLOAT) ? 0x8000000080000000UL : 0x8000000000000000UL; + maskFld = treeNode->TypeIs(TYP_FLOAT) ? &negBitmaskFlt : &negBitmaskDbl; + } + else if (treeNode->OperIs(GT_INTRINSIC)) + { + assert(treeNode->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Abs); + // Abs(x) = set sign-bit to zero + // Abs(f) = f & 0x7fffffff x4 (packed) + // Abs(d) = d & 0x7fffffffffffffff x2 (packed) + ins = INS_andps; + mask = treeNode->TypeIs(TYP_FLOAT) ? 0x7fffffff7fffffffUL : 0x7fffffffffffffffUL; + maskFld = treeNode->TypeIs(TYP_FLOAT) ? 
&absBitmaskFlt : &absBitmaskDbl; + } + else + { + assert(!"genSSE2BitwiseOp: unsupported oper"); + } + + if (*maskFld == nullptr) + { + UINT64 maskPack[] = {mask, mask}; + *maskFld = GetEmitter()->emitBlkConst(&maskPack, 16, 16, treeNode->TypeGet()); + } + + GetEmitter()->emitIns_SIMD_R_R_C(ins, size, targetReg, operandReg, *maskFld, 0); +} + +//----------------------------------------------------------------------------------------- +// genSSE41RoundOp - generate SSE41 code for the given tree as a round operation +// +// Arguments: +// treeNode - tree node +// +// Return value: +// None +// +// Assumptions: +// i) SSE4.1 is supported by the underlying hardware +// ii) treeNode oper is a GT_INTRINSIC +// iii) treeNode type is a floating point type +// iv) treeNode is not used from memory +// v) tree oper is NI_System_Math{F}_Round, _Ceiling, or _Floor +// vi) caller of this routine needs to call genProduceReg() +void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode) +{ + assert(false); +} + +//--------------------------------------------------------------------- +// genIntrinsic - generate code for a given intrinsic +// +// Arguments +// treeNode - the GT_INTRINSIC node +// +// Return value: +// None +// +void CodeGen::genIntrinsic(GenTree* treeNode) +{ + // Right now only Sqrt/Abs are treated as math intrinsics. + switch (treeNode->AsIntrinsic()->gtIntrinsicName) + { + case NI_System_Math_Sqrt: + { + // Both operand and its result must be of the same floating point type. + GenTree* srcNode = treeNode->AsOp()->gtOp1; + assert(varTypeIsFloating(srcNode)); + assert(srcNode->TypeGet() == treeNode->TypeGet()); + + genConsumeOperands(treeNode->AsOp()); + GetEmitter()->emitInsBinary(ins_FloatSqrt(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode, srcNode); + break; + } + + case NI_System_Math_Abs: + genSSE2BitwiseOp(treeNode); + break; + + case NI_System_Math_Round: + case NI_System_Math_Ceiling: + case NI_System_Math_Floor: + genSSE41RoundOp(treeNode->AsOp()); + break; + + default: + assert(!"genIntrinsic: Unsupported intrinsic"); + unreached(); + } + + genProduceReg(treeNode); +} + +//---------------------------------------------------------------------- +// genBitCast - Generate the instruction to move a value between register files +// +// Arguments +// targetType - the destination type +// targetReg - the destination register +// srcType - the source type +// srcReg - the source register +// +void CodeGen::genBitCast(var_types targetType, regNumber targetReg, var_types srcType, regNumber srcReg) +{ + const bool srcFltReg = varTypeUsesFloatReg(srcType) || varTypeIsSIMD(srcType); + assert(srcFltReg == genIsValidFloatReg(srcReg)); + const bool dstFltReg = varTypeUsesFloatReg(targetType) || varTypeIsSIMD(targetType); + assert(dstFltReg == genIsValidFloatReg(targetReg)); + if (srcFltReg != dstFltReg) + { + instruction ins; + regNumber fltReg; + regNumber intReg; + if (dstFltReg) + { + ins = ins_CopyIntToFloat(srcType, targetType); + fltReg = targetReg; + intReg = srcReg; + } + else + { + ins = ins_CopyFloatToInt(srcType, targetType); + intReg = targetReg; + fltReg = srcReg; + } + inst_RV_RV(ins, fltReg, intReg, targetType); + } + else if (targetReg != srcReg) + { + inst_RV_RV(ins_Copy(targetType), targetReg, srcReg, targetType); + } +} + +//---------------------------------------------------------------------- +// genCodeForBitCast - Generate code for a GT_BITCAST that is not contained +// +// Arguments +// treeNode - the GT_BITCAST for which we're generating code +// +void 
CodeGen::genCodeForBitCast(GenTreeOp* treeNode) +{ + regNumber targetReg = treeNode->GetRegNum(); + var_types targetType = treeNode->TypeGet(); + GenTree* op1 = treeNode->gtGetOp1(); + genConsumeRegs(op1); + + if (op1->isContained()) + { + assert(op1->IsLocal() || op1->isIndir()); + if (genIsRegCandidateLocal(op1)) + { + unsigned lclNum = op1->AsLclVar()->GetLclNum(); + GetEmitter()->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lclNum)), + emitTypeSize(treeNode), targetReg, lclNum, 0); + } + else + { + op1->gtType = treeNode->TypeGet(); + op1->SetRegNum(targetReg); + op1->ClearContained(); + JITDUMP("Changing type of BITCAST source to load directly."); + genCodeForTreeNode(op1); + } + } + else + { + genBitCast(targetType, targetReg, op1->TypeGet(), op1->GetRegNum()); + } + genProduceReg(treeNode); +} + +//-------------------------------------------------------------------------- // +// getBaseVarForPutArgStk - returns the baseVarNum for passing a stack arg. +// +// Arguments +// treeNode - the GT_PUTARG_STK node +// +// Return value: +// The number of the base variable. +// +// Note: +// If tail call the outgoing args are placed in the caller's incoming arg stack space. +// Otherwise, they go in the outgoing arg area on the current frame. +// +// On Windows the caller always creates slots (homing space) in its frame for the +// first 4 arguments of a callee (register passed args). So, the baseVarNum is always 0. +// For System V systems there is no such calling convention requirement, and the code needs to find +// the first stack passed argument from the caller. This is done by iterating over +// all the lvParam variables and finding the first with GetArgReg() equals to REG_STK. +// +//unsigned CodeGen::getBaseVarForPutArgStk(GenTree* treeNode) +//{ +// assert(false); +//} + +//--------------------------------------------------------------------- +// genAlignStackBeforeCall: Align the stack if necessary before a call. +// +// Arguments: +// call - the call node. +// +void CodeGen::genAlignStackBeforeCall(GenTreeCall* call) +{ +#if defined(UNIX_X86_ABI) + + // Have we aligned the stack yet? + if (!call->fgArgInfo->IsStkAlignmentDone()) + { + // We haven't done any stack alignment yet for this call. We might need to create + // an alignment adjustment, even if this function itself doesn't have any stack args. + // This can happen if this function call is part of a nested call sequence, and the outer + // call has already pushed some arguments. + + unsigned stkLevel = genStackLevel + call->fgArgInfo->GetStkSizeBytes(); + call->fgArgInfo->ComputeStackAlignment(stkLevel); + + unsigned padStkAlign = call->fgArgInfo->GetStkAlign(); + if (padStkAlign != 0) + { + // Now generate the alignment + inst_RV_IV(INS_sub, REG_SPBASE, padStkAlign, EA_PTRSIZE); + AddStackLevel(padStkAlign); + AddNestedAlignment(padStkAlign); + } + + call->fgArgInfo->SetStkAlignmentDone(); + } + +#endif // UNIX_X86_ABI +} + +//--------------------------------------------------------------------- +// genRemoveAlignmentAfterCall: After a call, remove the alignment +// added before the call, if any. +// +// Arguments: +// call - the call node. +// bias - additional stack adjustment +// +// Note: +// When bias > 0, caller should adjust stack level appropriately as +// bias is not considered when adjusting stack level. 
+// +void CodeGen::genRemoveAlignmentAfterCall(GenTreeCall* call, unsigned bias) +{ +#if defined(TARGET_X86) +#if defined(UNIX_X86_ABI) + // Put back the stack pointer if there was any padding for stack alignment + unsigned padStkAlign = call->fgArgInfo->GetStkAlign(); + unsigned padStkAdjust = padStkAlign + bias; + + if (padStkAdjust != 0) + { + inst_RV_IV(INS_add, REG_SPBASE, padStkAdjust, EA_PTRSIZE); + SubtractStackLevel(padStkAlign); + SubtractNestedAlignment(padStkAlign); + } +#else // UNIX_X86_ABI + if (bias != 0) + { + genAdjustSP(bias); + } +#endif // !UNIX_X86_ABI_ +#else // TARGET_X86 + assert(bias == 0); +#endif // !TARGET_X86 +} + +#ifdef TARGET_X86 + +//--------------------------------------------------------------------- +// genAdjustStackForPutArgStk: +// adjust the stack pointer for a putArgStk node if necessary. +// +// Arguments: +// putArgStk - the putArgStk node. +// +// Returns: true if the stack pointer was adjusted; false otherwise. +// +// Notes: +// Sets `m_pushStkArg` to true if the stack arg needs to be pushed, +// false if the stack arg needs to be stored at the current stack +// pointer address. This is exactly the opposite of the return value +// of this function. +// +bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk) +{ + const unsigned argSize = putArgStk->GetStackByteSize(); + GenTree* source = putArgStk->gtGetOp1(); + +#ifdef FEATURE_SIMD + if (!source->OperIs(GT_FIELD_LIST) && varTypeIsSIMD(source)) + { + inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE); + AddStackLevel(argSize); + m_pushStkArg = false; + return true; + } +#endif // FEATURE_SIMD + + // If the gtPutArgStkKind is one of the push types, we do not pre-adjust the stack. + // This is set in Lowering, and is true if and only if: + // - This argument contains any GC pointers OR + // - It is a GT_FIELD_LIST OR + // - It is less than 16 bytes in size. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef DEBUG + switch (putArgStk->gtPutArgStkKind) + { + case GenTreePutArgStk::Kind::RepInstr: + case GenTreePutArgStk::Kind::Unroll: + assert(!source->AsObj()->GetLayout()->HasGCPtr() && (argSize >= 16)); + break; + case GenTreePutArgStk::Kind::Push: + case GenTreePutArgStk::Kind::PushAllSlots: + assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() || (argSize < 16)); + break; + case GenTreePutArgStk::Kind::Invalid: + default: + assert(!"Uninitialized GenTreePutArgStk::Kind"); + break; + } +#endif // DEBUG + + if (putArgStk->isPushKind()) + { + m_pushStkArg = true; + return false; + } + else + { + m_pushStkArg = false; + + // If argSize is large, we need to probe the stack like we do in the prolog (genAllocLclFrame) + // or for localloc (genLclHeap), to ensure we touch the stack pages sequentially, and don't miss + // the stack guard pages. The prolog probes, but we don't know at this point how much higher + // the last probed stack pointer value is. We default a threshold. Any size below this threshold + // we are guaranteed the stack has been probed. Above this threshold, we don't know. The threshold + // should be high enough to cover all common cases. Increasing the threshold means adding a few + // more "lowest address of stack" probes in the prolog. Since this is relatively rare, add it to + // stress modes. 
+ + if ((argSize >= ARG_STACK_PROBE_THRESHOLD_BYTES) || + compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 5)) + { + genStackPointerConstantAdjustmentLoopWithProbe(-(ssize_t)argSize, REG_NA); + } + else + { + inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE); + } + + AddStackLevel(argSize); + return true; + } +} + +//--------------------------------------------------------------------- +// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack. +// +// Arguments +// treeNode - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST +// +// Return value: +// None +// +void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk) +{ + GenTreeFieldList* const fieldList = putArgStk->gtOp1->AsFieldList(); + assert(fieldList != nullptr); + + // Set m_pushStkArg and pre-adjust the stack if necessary. + const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk); + + // For now, we only support the "push" case; we will push a full slot for the first field of each slot + // within the struct. + assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg); + + // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0. + // (Note that this mode is not currently being used.) + // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them + // in reverse order, so we start with the current field offset at the size of the struct arg (which must be + // a multiple of the target pointer size). + unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->GetStackByteSize(); + unsigned prevFieldOffset = currentOffset; + regNumber intTmpReg = REG_NA; + regNumber simdTmpReg = REG_NA; + if (putArgStk->AvailableTempRegCount() != 0) + { + regMaskTP rsvdRegs = putArgStk->gtRsvdRegs; + if ((rsvdRegs & RBM_ALLINT) != 0) + { + intTmpReg = putArgStk->GetSingleTempReg(RBM_ALLINT); + assert(genIsValidIntReg(intTmpReg)); + } + if ((rsvdRegs & RBM_ALLFLOAT) != 0) + { + simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT); + assert(genIsValidFloatReg(simdTmpReg)); + } + assert(genCountBits(rsvdRegs) == (unsigned)((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1)); + } + + for (GenTreeFieldList::Use& use : fieldList->Uses()) + { + GenTree* const fieldNode = use.GetNode(); + const unsigned fieldOffset = use.GetOffset(); + var_types fieldType = use.GetType(); + + // Long-typed nodes should have been handled by the decomposition pass, and lowering should have sorted the + // field list in descending order by offset. + assert(!varTypeIsLong(fieldType)); + assert(fieldOffset <= prevFieldOffset); + + // Consume the register, if any, for this field. Note that genConsumeRegs() will appropriately + // update the liveness info for a lclVar that has been marked RegOptional, which hasn't been + // assigned a register, and which is therefore contained. + // Unlike genConsumeReg(), it handles the case where no registers are being consumed. + genConsumeRegs(fieldNode); + regNumber argReg = fieldNode->isUsedFromSpillTemp() ? REG_NA : fieldNode->GetRegNum(); + + // If the field is slot-like, we can use a push instruction to store the entire register no matter the type. + // + // The GC encoder requires that the stack remain 4-byte aligned at all times. Round the adjustment up + // to the next multiple of 4. If we are going to generate a `push` instruction, the adjustment must + // not require rounding. 
+ // NOTE: if the field is of GC type, we must use a push instruction, since the emitter is not otherwise + // able to detect stores into the outgoing argument area of the stack on x86. + const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4); + int adjustment = roundUp(currentOffset - fieldOffset, 4); + if (fieldIsSlot && !varTypeIsSIMD(fieldType)) + { + fieldType = genActualType(fieldType); + unsigned pushSize = genTypeSize(fieldType); + assert((pushSize % 4) == 0); + adjustment -= pushSize; + while (adjustment != 0) + { + inst_IV(INS_push, 0); + currentOffset -= pushSize; + AddStackLevel(pushSize); + adjustment -= pushSize; + } + m_pushStkArg = true; + } + else + { + m_pushStkArg = false; + + // We always "push" floating point fields (i.e. they are full slot values that don't + // require special handling). + assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode)); + + // If we can't push this field, it needs to be in a register so that we can store + // it to the stack location. + if (adjustment != 0) + { + // This moves the stack pointer to fieldOffset. + // For this case, we must adjust the stack and generate stack-relative stores rather than pushes. + // Adjust the stack pointer to the next slot boundary. + inst_RV_IV(INS_sub, REG_SPBASE, adjustment, EA_PTRSIZE); + currentOffset -= adjustment; + AddStackLevel(adjustment); + } + + // Does it need to be in a byte register? + // If so, we'll use intTmpReg, which must have been allocated as a byte register. + // If it's already in a register, but not a byteable one, then move it. + if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0))) + { + assert(intTmpReg != REG_NA); + noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0); + if (argReg != REG_NA) + { + inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType); + argReg = intTmpReg; + } + } + } + + if (argReg == REG_NA) + { + if (m_pushStkArg) + { + if (fieldNode->isUsedFromSpillTemp()) + { + assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD? + assert(fieldNode->IsRegOptional()); + TempDsc* tmp = getSpillTempDsc(fieldNode); + GetEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0); + regSet.tmpRlsTemp(tmp); + } + else + { + assert(varTypeIsIntegralOrI(fieldNode)); + switch (fieldNode->OperGet()) + { + case GT_LCL_VAR: + inst_TT(INS_push, fieldNode, 0, 0, emitActualTypeSize(fieldNode->TypeGet())); + break; + case GT_CNS_INT: + if (fieldNode->IsIconHandle()) + { + inst_IV_handle(INS_push, fieldNode->AsIntCon()->gtIconVal); + } + else + { + inst_IV(INS_push, fieldNode->AsIntCon()->gtIconVal); + } + break; + default: + unreached(); + } + } + currentOffset -= TARGET_POINTER_SIZE; + AddStackLevel(TARGET_POINTER_SIZE); + } + else + { + // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack. 
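The offset bookkeeping of the field-list loop above can be pictured on its own: fields arrive in descending-offset order, currentOffset starts at the rounded struct size, and padding pushes fill any gap so every field lands at its own offset. A reduced model of that bookkeeping, assuming 4-byte pushes and an invented field layout (this is an illustration, not the JIT's code path):

#include <cassert>
#include <cstdio>
#include <vector>

struct Field { unsigned offset; unsigned size; };

int main()
{
    const unsigned structSize = 12;                 // multiple of the 4-byte slot size
    std::vector<Field> fields = { {8, 4}, {0, 4} }; // descending offsets, 4-byte gap at offset 4

    unsigned currentOffset = structSize;
    for (const Field& f : fields)
    {
        // Pad with "push 0" until pushing this field will place it at f.offset.
        while (currentOffset > f.offset + f.size)
        {
            std::printf("push 0      ; padding slot at offset %u\n", currentOffset - 4);
            currentOffset -= 4;
        }
        std::printf("push field  ; offset %u\n", f.offset);
        currentOffset -= f.size;
    }
    assert(currentOffset == 0); // the whole outgoing area has been materialized
    return 0;
}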
+ assert(varTypeIsIntegralOrI(fieldNode)); + switch (fieldNode->OperGet()) + { + case GT_LCL_VAR: + inst_RV_TT(INS_mov, intTmpReg, fieldNode); + break; + case GT_CNS_INT: + genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode); + break; + default: + unreached(); + } + genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset); + } + } + else + { +#if defined(FEATURE_SIMD) + if (fieldType == TYP_SIMD12) + { + assert(genIsValidFloatReg(simdTmpReg)); + genStoreSIMD12ToStack(argReg, simdTmpReg); + } + else +#endif // defined(FEATURE_SIMD) + { + genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset); + } + if (m_pushStkArg) + { + // We always push a slot-rounded size + currentOffset -= genTypeSize(fieldType); + } + } + + prevFieldOffset = fieldOffset; + } + if (currentOffset != 0) + { + // We don't expect padding at the beginning of a struct, but it could happen with explicit layout. + inst_RV_IV(INS_sub, REG_SPBASE, currentOffset, EA_PTRSIZE); + AddStackLevel(currentOffset); + } +} +#endif // TARGET_X86 + +//--------------------------------------------------------------------- +// genPutArgStk - generate code for passing an arg on the stack. +// +// Arguments +// treeNode - the GT_PUTARG_STK node +// targetType - the type of the treeNode +// +// Return value: +// None +// +void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk) +{ + assert(false); +} + +//--------------------------------------------------------------------- +// genPutArgReg - generate code for a GT_PUTARG_REG node +// +// Arguments +// tree - the GT_PUTARG_REG node +// +// Return value: +// None +// +void CodeGen::genPutArgReg(GenTreeOp* tree) +{ + assert(tree->OperIs(GT_PUTARG_REG)); + + var_types targetType = tree->TypeGet(); + regNumber targetReg = tree->GetRegNum(); + +#ifndef UNIX_AMD64_ABI + assert(targetType != TYP_STRUCT); +#endif // !UNIX_AMD64_ABI + + GenTree* op1 = tree->gtOp1; + genConsumeReg(op1); + + // If child node is not already in the register we need, move it + if (targetReg != op1->GetRegNum()) + { + inst_RV_RV(ins_Copy(targetType), targetReg, op1->GetRegNum(), targetType); + } + + genProduceReg(tree); +} + +#ifdef TARGET_X86 +// genPushReg: Push a register value onto the stack and adjust the stack level +// +// Arguments: +// type - the type of value to be stored +// reg - the register containing the value +// +// Notes: +// For TYP_LONG, the srcReg must be a floating point register. +// Otherwise, the register type must be consistent with the given type. +// +void CodeGen::genPushReg(var_types type, regNumber srcReg) +{ + unsigned size = genTypeSize(type); + if (varTypeIsIntegralOrI(type) && type != TYP_LONG) + { + assert(genIsValidIntReg(srcReg)); + inst_RV(INS_push, srcReg, type); + } + else + { + instruction ins; + emitAttr attr = emitTypeSize(type); + if (type == TYP_LONG) + { + // On x86, the only way we can push a TYP_LONG from a register is if it is in an xmm reg. + // This is only used when we are pushing a struct from memory to memory, and basically is + // handling an 8-byte "chunk", as opposed to strictly a long type. 
+ ins = INS_movq; + } + else + { + ins = ins_Store(type); + } + assert(genIsValidFloatReg(srcReg)); + inst_RV_IV(INS_sub, REG_SPBASE, size, EA_PTRSIZE); + GetEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, 0); + } + AddStackLevel(size); +} +#endif // TARGET_X86 + +#if defined(FEATURE_PUT_STRUCT_ARG_STK) +// genStoreRegToStackArg: Store a register value into the stack argument area +// +// Arguments: +// type - the type of value to be stored +// reg - the register containing the value +// offset - the offset from the base (see Assumptions below) +// +// Notes: +// A type of TYP_STRUCT instructs this method to store a 16-byte chunk +// at the given offset (i.e. not the full struct). +// +// Assumptions: +// The caller must set the context appropriately before calling this method: +// - On x64, m_stkArgVarNum must be set according to whether this is a regular or tail call. +// - On x86, the caller must set m_pushStkArg if this method should push the argument. +// Otherwise, the argument is stored at the given offset from sp. +// +// TODO: In the below code the load and store instructions are for 16 bytes, but the +// type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but +// this probably needs to be changed. +// +void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset) +{ + assert(srcReg != REG_NA); + instruction ins; + emitAttr attr; + unsigned size; + + if (type == TYP_STRUCT) + { + ins = INS_movdqu; + // This should be changed! + attr = EA_8BYTE; + size = 16; + } + else + { +#ifdef FEATURE_SIMD + if (varTypeIsSIMD(type)) + { + assert(genIsValidFloatReg(srcReg)); + ins = ins_Store(type); // TODO-CQ: pass 'aligned' correctly + } + else +#endif // FEATURE_SIMD +#ifdef TARGET_X86 + if (type == TYP_LONG) + { + assert(genIsValidFloatReg(srcReg)); + ins = INS_movq; + } + else +#endif // TARGET_X86 + { + assert((varTypeUsesFloatReg(type) && genIsValidFloatReg(srcReg)) || + (varTypeIsIntegralOrI(type) && genIsValidIntReg(srcReg))); + ins = ins_Store(type); + } + attr = emitTypeSize(type); + size = genTypeSize(type); + } + +#ifdef TARGET_X86 + if (m_pushStkArg) + { + genPushReg(type, srcReg); + } + else + { + GetEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, offset); + } +#else // !TARGET_X86 + assert(m_stkArgVarNum != BAD_VAR_NUM); + GetEmitter()->emitIns_S_R(ins, attr, srcReg, m_stkArgVarNum, m_stkArgOffset + offset); +#endif // !TARGET_X86 +} + +//--------------------------------------------------------------------- +// genPutStructArgStk - generate code for copying a struct arg on the stack by value. +// In case there are references to heap object in the struct, +// it generates the gcinfo as well. +// +// Arguments +// putArgStk - the GT_PUTARG_STK node +// +// Notes: +// In the case of fixed out args, the caller must have set m_stkArgVarNum to the variable number +// corresponding to the argument area (where we will put the argument on the stack). +// For tail calls this is the baseVarNum = 0. +// For non tail calls this is the outgoingArgSpace. +void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk) +{ + assert(false); +} +#endif // defined(FEATURE_PUT_STRUCT_ARG_STK) + +/***************************************************************************** + * + * Create and record GC Info for the function. 
+ */ +#ifndef JIT32_GCENCODER +void +#else // !JIT32_GCENCODER +void* +#endif // !JIT32_GCENCODER +CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* codePtr)) +{ +#ifdef JIT32_GCENCODER + return genCreateAndStoreGCInfoJIT32(codeSize, prologSize, epilogSize DEBUGARG(codePtr)); +#else // !JIT32_GCENCODER + genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUGARG(codePtr)); +#endif // !JIT32_GCENCODER +} + +#ifdef JIT32_GCENCODER +void* CodeGen::genCreateAndStoreGCInfoJIT32(unsigned codeSize, + unsigned prologSize, + unsigned epilogSize DEBUGARG(void* codePtr)) +{ + BYTE headerBuf[64]; + InfoHdr header; + + int s_cached; + +#ifdef FEATURE_EH_FUNCLETS + // We should do this before gcInfoBlockHdrSave since varPtrTableSize must be finalized before it + if (compiler->ehAnyFunclets()) + { + gcInfo.gcMarkFilterVarsPinned(); + } +#endif + +#ifdef DEBUG + size_t headerSize = +#endif + compiler->compInfoBlkSize = + gcInfo.gcInfoBlockHdrSave(headerBuf, 0, codeSize, prologSize, epilogSize, &header, &s_cached); + + size_t argTabOffset = 0; + size_t ptrMapSize = gcInfo.gcPtrTableSize(header, codeSize, &argTabOffset); + +#if DISPLAY_SIZES + + if (GetInterruptible()) + { + gcHeaderISize += compiler->compInfoBlkSize; + gcPtrMapISize += ptrMapSize; + } + else + { + gcHeaderNSize += compiler->compInfoBlkSize; + gcPtrMapNSize += ptrMapSize; + } + +#endif // DISPLAY_SIZES + + compiler->compInfoBlkSize += ptrMapSize; + + /* Allocate the info block for the method */ + + compiler->compInfoBlkAddr = (BYTE*)compiler->info.compCompHnd->allocGCInfo(compiler->compInfoBlkSize); + +#if 0 // VERBOSE_SIZES + // TODO-X86-Cleanup: 'dataSize', below, is not defined + +// if (compiler->compInfoBlkSize > codeSize && compiler->compInfoBlkSize > 100) + { + printf("[%7u VM, %7u+%7u/%7u x86 %03u/%03u%%] %s.%s\n", + compiler->info.compILCodeSize, + compiler->compInfoBlkSize, + codeSize + dataSize, + codeSize + dataSize - prologSize - epilogSize, + 100 * (codeSize + dataSize) / compiler->info.compILCodeSize, + 100 * (codeSize + dataSize + compiler->compInfoBlkSize) / compiler->info.compILCodeSize, + compiler->info.compClassName, + compiler->info.compMethodName); +} + +#endif + + /* Fill in the info block and return it to the caller */ + + void* infoPtr = compiler->compInfoBlkAddr; + + /* Create the method info block: header followed by GC tracking tables */ + + compiler->compInfoBlkAddr += + gcInfo.gcInfoBlockHdrSave(compiler->compInfoBlkAddr, -1, codeSize, prologSize, epilogSize, &header, &s_cached); + + assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize); + compiler->compInfoBlkAddr = gcInfo.gcPtrTableSave(compiler->compInfoBlkAddr, header, codeSize, &argTabOffset); + assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize + ptrMapSize); + +#ifdef DEBUG + + if (0) + { + BYTE* temp = (BYTE*)infoPtr; + size_t size = compiler->compInfoBlkAddr - temp; + BYTE* ptab = temp + headerSize; + + noway_assert(size == headerSize + ptrMapSize); + + printf("Method info block - header [%zu bytes]:", headerSize); + + for (unsigned i = 0; i < size; i++) + { + if (temp == ptab) + { + printf("\nMethod info block - ptrtab [%u bytes]:", ptrMapSize); + printf("\n %04X: %*c", i & ~0xF, 3 * (i & 0xF), ' '); + } + else + { + if (!(i % 16)) + printf("\n %04X: ", i); + } + + printf("%02X ", *temp++); + } + + printf("\n"); + } + +#endif // DEBUG + +#if DUMP_GC_TABLES + + if (compiler->opts.dspGCtbls) + { + const BYTE* base = (BYTE*)infoPtr; + size_t size; + unsigned methodSize; + 
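genCreateAndStoreGCInfoJIT32 above fills a single allocation in two steps, header first and pointer table second, and its asserts check that the write cursor lands exactly headerSize + ptrMapSize past the start. The same invariant in a reduced, self-contained form (the sizes and fill bytes below are made up for illustration):

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

int main()
{
    const std::size_t headerSize = 8;
    const std::size_t ptrMapSize = 24;

    std::vector<unsigned char> block(headerSize + ptrMapSize);
    unsigned char* cursor = block.data();

    std::memset(cursor, 0xAA, headerSize); // stands in for gcInfoBlockHdrSave
    cursor += headerSize;

    std::memset(cursor, 0xBB, ptrMapSize); // stands in for gcPtrTableSave
    cursor += ptrMapSize;

    // Same shape as the noway_asserts in genCreateAndStoreGCInfoJIT32.
    assert(cursor == block.data() + headerSize + ptrMapSize);
    return 0;
}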
InfoHdr dumpHeader; + + printf("GC Info for method %s\n", compiler->info.compFullName); + printf("GC info size = %3u\n", compiler->compInfoBlkSize); + + size = gcInfo.gcInfoBlockHdrDump(base, &dumpHeader, &methodSize); + // printf("size of header encoding is %3u\n", size); + printf("\n"); + + if (compiler->opts.dspGCtbls) + { + base += size; + size = gcInfo.gcDumpPtrTable(base, dumpHeader, methodSize); + // printf("size of pointer table is %3u\n", size); + printf("\n"); + noway_assert(compiler->compInfoBlkAddr == (base + size)); + } + } + +#endif // DUMP_GC_TABLES + + /* Make sure we ended up generating the expected number of bytes */ + + noway_assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + compiler->compInfoBlkSize); + + return infoPtr; +} + +#else // !JIT32_GCENCODER +void CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUGARG(void* codePtr)) +{ + IAllocator* allowZeroAlloc = new (compiler, CMK_GC) CompIAllocator(compiler->getAllocatorGC()); + GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC) + GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc, NOMEM); + assert(gcInfoEncoder); + + // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32). + gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize); + + // We keep the call count for the second call to gcMakeRegPtrTable() below. + unsigned callCnt = 0; + // First we figure out the encoder ID's for the stack slots and registers. + gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS, &callCnt); + // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them). + gcInfoEncoder->FinalizeSlotIds(); + // Now we can actually use those slot ID's to declare live ranges. 
+ gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK, &callCnt); + + if (compiler->opts.compDbgEnC) + { + // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp) + // which is: + // -return address + // -saved off RBP + // -saved 'this' pointer and bool for synchronized methods + + // 4 slots for RBP + return address + RSI + RDI + int preservedAreaSize = 4 * REGSIZE_BYTES; + + if (compiler->info.compFlags & CORINFO_FLG_SYNCH) + { + if (!(compiler->info.compFlags & CORINFO_FLG_STATIC)) + { + preservedAreaSize += REGSIZE_BYTES; + } + + // bool in synchronized methods that tracks whether the lock has been taken (takes 4 bytes on stack) + preservedAreaSize += 4; + } + + // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the + // frame + gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize); + } + + if (compiler->opts.IsReversePInvoke()) + { + unsigned reversePInvokeFrameVarNumber = compiler->lvaReversePInvokeFrameVar; + assert(reversePInvokeFrameVarNumber != BAD_VAR_NUM && reversePInvokeFrameVarNumber < compiler->lvaRefCount); + LclVarDsc& reversePInvokeFrameVar = compiler->lvaTable[reversePInvokeFrameVarNumber]; + gcInfoEncoder->SetReversePInvokeFrameSlot(reversePInvokeFrameVar.GetStackOffset()); + } + + gcInfoEncoder->Build(); + + // GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t) + // let's save the values anyway for debugging purposes + compiler->compInfoBlkAddr = gcInfoEncoder->Emit(); + compiler->compInfoBlkSize = 0; // not exposed by the GCEncoder interface +} +#endif // !JIT32_GCENCODER + +/***************************************************************************** + * Emit a call to a helper function. + * + */ + +void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg) +{ + void* addr = nullptr; + void* pAddr = nullptr; + + emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN; + addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr); + regNumber callTarget = REG_NA; + regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper); + + if (!addr) + { + assert(pAddr != nullptr); + + // Absolute indirect call addr + // Note: Order of checks is important. First always check for pc-relative and next + // zero-relative. Because the former encoding is 1-byte smaller than the latter. + if (genCodeIndirAddrCanBeEncodedAsPCRelOffset((size_t)pAddr) || + genCodeIndirAddrCanBeEncodedAsZeroRelOffset((size_t)pAddr)) + { + // generate call whose target is specified by 32-bit offset relative to PC or zero. + callType = emitter::EC_FUNC_TOKEN_INDIR; + addr = pAddr; + } + else + { +#ifdef TARGET_AMD64 + // If this indirect address cannot be encoded as 32-bit offset relative to PC or Zero, + // load it into REG_HELPER_CALL_TARGET and use register indirect addressing mode to + // make the call. + // mov reg, addr + // call [reg] + + if (callTargetReg == REG_NA) + { + // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but + // this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET. 
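The encoding choice in genEmitHelperCall above hinges on whether the 64-bit helper address fits in a signed 32-bit displacement, either relative to the next instruction or relative to address zero; only if neither fits does the JIT load the address into a register and call through it. A small stand-alone version of that test (illustrative only, not the JIT's genCodeIndirAddrCanBeEncodedAs* helpers):

#include <cstdint>

// True when 'target' is reachable with a signed 32-bit displacement from the
// next instruction (pcRel) or from address zero (zeroRel).
static bool FitsInInt32(int64_t value)
{
    return value >= INT32_MIN && value <= INT32_MAX;
}

bool CanUse32BitDisplacement(uint64_t target, uint64_t nextInstrAddr)
{
    bool pcRel   = FitsInInt32((int64_t)(target - nextInstrAddr)); // disp32 from RIP
    bool zeroRel = FitsInInt32((int64_t)target);                   // disp32 from 0
    return pcRel || zeroRel;
}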
+ callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET; + regMaskTP callTargetMask = genRegMask(callTargetReg); + noway_assert((callTargetMask & killMask) == callTargetMask); + } + else + { + // The call target must not overwrite any live variable, though it may not be in the + // kill set for the call. + regMaskTP callTargetMask = genRegMask(callTargetReg); + noway_assert((callTargetMask & regSet.GetMaskVars()) == RBM_NONE); + } +#endif + + callTarget = callTargetReg; + CodeGen::genSetRegToIcon(callTarget, (ssize_t)pAddr, TYP_I_IMPL); + callType = emitter::EC_INDIR_ARD; + } + } + + // clang-format off + GetEmitter()->emitIns_Call(callType, + compiler->eeFindHelper(helper), + INDEBUG_LDISASM_COMMA(nullptr) addr, + argSize, + retSize + MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(EA_UNKNOWN), + gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, + BAD_IL_OFFSET, // IL offset + callTarget, // ireg + REG_NA, 0, 0, // xreg, xmul, disp + false // isJump + ); + // clang-format on + + regSet.verifyRegistersUsed(killMask); +} + +/***************************************************************************** +* Unit testing of the XArch emitter: generate a bunch of instructions into the prolog +* (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late +* disassembler thinks the instructions as the same as we do. +*/ + +// Uncomment "#define ALL_ARM64_EMITTER_UNIT_TESTS" to run all the unit tests here. +// After adding a unit test, and verifying it works, put it under this #ifdef, so we don't see it run every time. +//#define ALL_XARCH_EMITTER_UNIT_TESTS + +#if defined(DEBUG) && defined(LATE_DISASM) && defined(TARGET_AMD64) +void CodeGen::genAmd64EmitterUnitTests() +{ + if (!verbose) + { + return; + } + + if (!compiler->opts.altJit) + { + // No point doing this in a "real" JIT. + return; + } + + // Mark the "fake" instructions in the output. + printf("*************** In genAmd64EmitterUnitTests()\n"); + + // We use this: + // genDefineTempLabel(genCreateTempLabel()); + // to create artificial labels to help separate groups of tests. 
+ + // + // Loads + // + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef ALL_XARCH_EMITTER_UNIT_TESTS + genDefineTempLabel(genCreateTempLabel()); + + // vhaddpd ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_haddpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vaddss xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_addss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vaddsd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vaddps xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_addps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vaddps ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_addps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vaddpd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_addpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vaddpd ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_addpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vsubss xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_subss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vsubsd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_subsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vsubps ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_subps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vsubps ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_subps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vsubpd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_subpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vsubpd ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_subpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vmulss xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_mulss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vmulsd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_mulsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vmulps xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_mulps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vmulpd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_mulpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vmulps ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_mulps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vmulpd ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_mulpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vandps xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_andps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vandpd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_andpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vandps ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_andps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vandpd ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_andpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vorps xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_orps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vorpd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_orpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vorps ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_orps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vorpd ymm0,ymm1,ymm2 + GetEmitter()->emitIns_R_R_R(INS_orpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vdivss xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vdivsd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vdivss xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vdivsd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + + // vdivss 
xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_cvtss2sd, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); + // vdivsd xmm0,xmm1,xmm2 + GetEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); +#endif // ALL_XARCH_EMITTER_UNIT_TESTS + printf("*************** End of genAmd64EmitterUnitTests()\n"); +} + +#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(TARGET_AMD64) + +#ifdef PROFILING_SUPPORTED + +#ifdef TARGET_X86 + +//----------------------------------------------------------------------------------- +// genProfilingEnterCallback: Generate the profiling function enter callback. +// +// Arguments: +// initReg - register to use as scratch register +// pInitRegZeroed - OUT parameter. This variable remains unchanged. +// +// Return Value: +// None +// +// Notes: +// The x86 profile enter helper has the following requirements (see ProfileEnterNaked in +// VM\i386\asmhelpers.asm for details): +// 1. The calling sequence for calling the helper is: +// push FunctionIDOrClientID +// call ProfileEnterHelper +// 2. The calling function has an EBP frame. +// 3. EBP points to the saved ESP which is the first thing saved in the function. Thus, +// the following prolog is assumed: +// push ESP +// mov EBP, ESP +// 4. All registers are preserved. +// 5. The helper pops the FunctionIDOrClientID argument from the stack. +// +void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + // Give profiler a chance to back out of hooking this method + if (!compiler->compIsProfilerHookNeeded()) + { + return; + } + + unsigned saveStackLvl2 = genStackLevel; + +// Important note: when you change enter probe layout, you must also update SKIP_ENTER_PROF_CALLBACK() +// for x86 stack unwinding + +#if defined(UNIX_X86_ABI) + // Manually align the stack to be 16-byte aligned. This is similar to CodeGen::genAlignStackBeforeCall() + GetEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, REG_SPBASE, 0xC); +#endif // UNIX_X86_ABI + + // Push the profilerHandle + if (compiler->compProfilerMethHndIndirected) + { + GetEmitter()->emitIns_AR_R(INS_push, EA_PTR_DSP_RELOC, REG_NA, REG_NA, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + inst_IV(INS_push, (size_t)compiler->compProfilerMethHnd); + } + + // This will emit either + // "call ip-relative 32-bit offset" or + // "mov rax, helper addr; call rax" + genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, + 0, // argSize. Again, we have to lie about it + EA_UNKNOWN); // retSize + + // Check that we have place for the push. + assert(compiler->fgGetPtrArgCntMax() >= 1); + +#if defined(UNIX_X86_ABI) + // Restoring alignment manually. This is similar to CodeGen::genRemoveAlignmentAfterCall + GetEmitter()->emitIns_R_I(INS_add, EA_4BYTE, REG_SPBASE, 0x10); +#endif // UNIX_X86_ABI + + /* Restore the stack level */ + + SetStackLevel(saveStackLvl2); +} + +//----------------------------------------------------------------------------------- +// genProfilingLeaveCallback: Generate the profiling function leave or tailcall callback. +// Technically, this is not part of the epilog; it is called when we are generating code for a GT_RETURN node. +// +// Arguments: +// helper - which helper to call. Either CORINFO_HELP_PROF_FCN_LEAVE or CORINFO_HELP_PROF_FCN_TAILCALL +// +// Return Value: +// None +// +// Notes: +// The x86 profile leave/tailcall helper has the following requirements (see ProfileLeaveNaked and +// ProfileTailcallNaked in VM\i386\asmhelpers.asm for details): +// 1. 
The calling sequence for calling the helper is: +// push FunctionIDOrClientID +// call ProfileLeaveHelper or ProfileTailcallHelper +// 2. The calling function has an EBP frame. +// 3. EBP points to the saved ESP which is the first thing saved in the function. Thus, +// the following prolog is assumed: +// push ESP +// mov EBP, ESP +// 4. helper == CORINFO_HELP_PROF_FCN_LEAVE: All registers are preserved. +// helper == CORINFO_HELP_PROF_FCN_TAILCALL: Only argument registers are preserved. +// 5. The helper pops the FunctionIDOrClientID argument from the stack. +// +void CodeGen::genProfilingLeaveCallback(unsigned helper) +{ + assert((helper == CORINFO_HELP_PROF_FCN_LEAVE) || (helper == CORINFO_HELP_PROF_FCN_TAILCALL)); + + // Only hook if profiler says it's okay. + if (!compiler->compIsProfilerHookNeeded()) + { + return; + } + + compiler->info.compProfilerCallback = true; + + // Need to save on to the stack level, since the helper call will pop the argument + unsigned saveStackLvl2 = genStackLevel; + +#if defined(UNIX_X86_ABI) + // Manually align the stack to be 16-byte aligned. This is similar to CodeGen::genAlignStackBeforeCall() + GetEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, REG_SPBASE, 0xC); + AddStackLevel(0xC); + AddNestedAlignment(0xC); +#endif // UNIX_X86_ABI + + // + // Push the profilerHandle + // + + if (compiler->compProfilerMethHndIndirected) + { + GetEmitter()->emitIns_AR_R(INS_push, EA_PTR_DSP_RELOC, REG_NA, REG_NA, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + inst_IV(INS_push, (size_t)compiler->compProfilerMethHnd); + } + genSinglePush(); + +#if defined(UNIX_X86_ABI) + int argSize = -REGSIZE_BYTES; // negative means caller-pop (cdecl) +#else + int argSize = REGSIZE_BYTES; +#endif + genEmitHelperCall(helper, argSize, EA_UNKNOWN /* retSize */); + + // Check that we have place for the push. + assert(compiler->fgGetPtrArgCntMax() >= 1); + +#if defined(UNIX_X86_ABI) + // Restoring alignment manually. This is similar to CodeGen::genRemoveAlignmentAfterCall + GetEmitter()->emitIns_R_I(INS_add, EA_4BYTE, REG_SPBASE, 0x10); + SubtractStackLevel(0x10); + SubtractNestedAlignment(0xC); +#endif // UNIX_X86_ABI + + /* Restore the stack level */ + SetStackLevel(saveStackLvl2); +} + +#endif // TARGET_X86 + + +//----------------------------------------------------------------------------------- +// genProfilingEnterCallback: Generate the profiling function enter callback. +// +// Arguments: +// initReg - register to use as scratch register +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. +// +// Return Value: +// None +// +void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) +{ + assert(compiler->compGeneratingProlog); + + // Give profiler a chance to back out of hooking this method + if (!compiler->compIsProfilerHookNeeded()) + { + return; + } + +#if !defined(UNIX_AMD64_ABI) + + unsigned varNum; + LclVarDsc* varDsc; + + // Since the method needs to make a profiler callback, it should have out-going arg space allocated. + noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + noway_assert(compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES)); + + // Home all arguments passed in arg registers (RCX, RDX, R8 and R9). + // In case of vararg methods, arg regs are already homed. + // + // Note: Here we don't need to worry about updating gc'info since enter + // callback is generated as part of prolog which is non-gc interruptible. 
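The UNIX_X86_ABI blocks in the two x86 callbacks above keep ESP 16-byte aligned at the call: 12 bytes of padding plus the 4-byte FunctionIDOrClientID push fill exactly one 16-byte slot, and the later add esp, 0x10 releases it. The arithmetic in isolation, where the 16-byte alignment requirement is the assumption:

#include <cassert>

// Padding needed before the call so that padding + pushed bytes is a whole
// number of alignment-sized slots.
unsigned PreCallPadding(unsigned bytesToPush, unsigned alignment = 16)
{
    unsigned rem = bytesToPush % alignment;
    return (rem == 0) ? 0 : (alignment - rem);
}

int main()
{
    assert(PreCallPadding(4) == 12);     // matches the 'sub esp, 0xC' above
    assert(PreCallPadding(4) + 4 == 16); // matches the 'add esp, 0x10' cleanup
    return 0;
}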
+ // Moreover GC cannot kick while executing inside profiler callback which is a + // profiler requirement so it can examine arguments which could be obj refs. + if (!compiler->info.compIsVarArgs) + { + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->info.compArgsCount; varNum++, varDsc++) + { + noway_assert(varDsc->lvIsParam); + + if (!varDsc->lvIsRegArg) + { + continue; + } + + var_types storeType = varDsc->lvaArgType(); + regNumber argReg = varDsc->GetArgReg(); + + instruction store_ins = ins_Store(storeType); + +#ifdef FEATURE_SIMD + if ((storeType == TYP_SIMD8) && genIsValidIntReg(argReg)) + { + store_ins = INS_mov; + } +#endif // FEATURE_SIMD + + GetEmitter()->emitIns_S_R(store_ins, emitTypeSize(storeType), argReg, varNum, 0); + } + } + + // Emit profiler EnterCallback(ProfilerMethHnd, caller's SP) + // RCX = ProfilerMethHnd + if (compiler->compProfilerMethHndIndirected) + { + // Profiler hooks enabled during Ngen time. + // Profiler handle needs to be accessed through an indirection of a pointer. + GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + // No need to record relocations, if we are generating ELT hooks under the influence + // of COMPlus_JitELTHookEnabled=1 + if (compiler->opts.compJitELTHookEnabled) + { + genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); + } + else + { + instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + } + + // RDX = caller's SP + // Notes + // 1) Here we can query caller's SP offset since prolog will be generated after final frame layout. + // 2) caller's SP relative offset to FramePointer will be negative. We need to add absolute value + // of that offset to FramePointer to obtain caller's SP value. + assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); + + // This will emit either + // "call ip-relative 32-bit offset" or + // "mov rax, helper addr; call rax" + genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN); + + // TODO-AMD64-CQ: Rather than reloading, see if this could be optimized by combining with prolog + // generation logic that moves args around as required by first BB entry point conditions + // computed by LSRA. Code pointers for investigating this further: genFnPrologCalleeRegArgs() + // and genEnregisterIncomingStackArgs(). + // + // Now reload arg registers from home locations. + // Vararg methods: + // - we need to reload only known (i.e. fixed) reg args. 
+ // - if floating point type, also reload it into corresponding integer reg + for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->info.compArgsCount; varNum++, varDsc++) + { + noway_assert(varDsc->lvIsParam); + + if (!varDsc->lvIsRegArg) + { + continue; + } + + var_types loadType = varDsc->lvaArgType(); + regNumber argReg = varDsc->GetArgReg(); + + instruction load_ins = ins_Load(loadType); + +#ifdef FEATURE_SIMD + if ((loadType == TYP_SIMD8) && genIsValidIntReg(argReg)) + { + load_ins = INS_mov; + } +#endif // FEATURE_SIMD + + GetEmitter()->emitIns_R_S(load_ins, emitTypeSize(loadType), argReg, varNum, 0); + +#if FEATURE_VARARG + if (compiler->info.compIsVarArgs && varTypeIsFloating(loadType)) + { + regNumber intArgReg = compiler->getCallArgIntRegister(argReg); + instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); + inst_RV_RV(ins, argReg, intArgReg, loadType); + } +#endif // FEATURE_VARARG + } + + // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. + if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) + { + *pInitRegZeroed = false; + } + +#else // !defined(UNIX_AMD64_ABI) + + // Emit profiler EnterCallback(ProfilerMethHnd, caller's SP) + // R14 = ProfilerMethHnd + if (compiler->compProfilerMethHndIndirected) + { + // Profiler hooks enabled during Ngen time. + // Profiler handle needs to be accessed through an indirection of a pointer. + GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_PROFILER_ENTER_ARG_0, + (ssize_t)compiler->compProfilerMethHnd); + } + else + { + // No need to record relocations, if we are generating ELT hooks under the influence + // of COMPlus_JitELTHookEnabled=1 + if (compiler->opts.compJitELTHookEnabled) + { + genSetRegToIcon(REG_PROFILER_ENTER_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); + } + else + { + instGen_Set_Reg_To_Imm(EA_8BYTE, REG_PROFILER_ENTER_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + } + + // R15 = caller's SP + // Notes + // 1) Here we can query caller's SP offset since prolog will be generated after final frame layout. + // 2) caller's SP relative offset to FramePointer will be negative. We need to add absolute value + // of that offset to FramePointer to obtain caller's SP value. + assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_PROFILER_ENTER_ARG_1, genFramePointerReg(), -callerSPOffset); + + // We can use any callee trash register (other than RAX, RDI, RSI) for call target. + // We use R11 here. This will emit either + // "call ip-relative 32-bit offset" or + // "mov r11, helper addr; call r11" + genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET); + + // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. + if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) + { + *pInitRegZeroed = false; + } + +#endif // !defined(UNIX_AMD64_ABI) +} + +//----------------------------------------------------------------------------------- +// genProfilingLeaveCallback: Generate the profiling function leave or tailcall callback. +// Technically, this is not part of the epilog; it is called when we are generating code for a GT_RETURN node. +// +// Arguments: +// helper - which helper to call. 
Either CORINFO_HELP_PROF_FCN_LEAVE or CORINFO_HELP_PROF_FCN_TAILCALL +// +// Return Value: +// None +// +void CodeGen::genProfilingLeaveCallback(unsigned helper) +{ + assert((helper == CORINFO_HELP_PROF_FCN_LEAVE) || (helper == CORINFO_HELP_PROF_FCN_TAILCALL)); + + // Only hook if profiler says it's okay. + if (!compiler->compIsProfilerHookNeeded()) + { + return; + } + + compiler->info.compProfilerCallback = true; + +#if !defined(UNIX_AMD64_ABI) + + // Since the method needs to make a profiler callback, it should have out-going arg space allocated. + noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); + noway_assert(compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES)); + + // If thisPtr needs to be kept alive and reported, it cannot be one of the callee trash + // registers that profiler callback kills. + if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvIsInReg()) + { + regMaskTP thisPtrMask = genRegMask(compiler->lvaTable[compiler->info.compThisArg].GetRegNum()); + noway_assert((RBM_PROFILER_LEAVE_TRASH & thisPtrMask) == 0); + } + + // At this point return value is computed and stored in RAX or XMM0. + // On Amd64, Leave callback preserves the return register. We keep + // RAX alive by not reporting as trashed by helper call. Also note + // that GC cannot kick-in while executing inside profiler callback, + // which is a requirement of profiler as well since it needs to examine + // return value which could be an obj ref. + + // RCX = ProfilerMethHnd + if (compiler->compProfilerMethHndIndirected) + { + // Profiler hooks enabled during Ngen time. + // Profiler handle needs to be accessed through an indirection of an address. + GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + // Don't record relocations, if we are generating ELT hooks under the influence + // of COMPlus_JitELTHookEnabled=1 + if (compiler->opts.compJitELTHookEnabled) + { + genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); + } + else + { + instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + } + + // RDX = caller's SP + // TODO-AMD64-Cleanup: Once we start doing codegen after final frame layout, retain the "if" portion + // of the stmnts to execute unconditionally and clean-up rest. + if (compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) + { + // Caller's SP relative offset to FramePointer will be negative. We need to add absolute + // value of that offset to FramePointer to obtain caller's SP value. + int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); + } + else + { + // If we are here means that it is a tentative frame layout during which we + // cannot use caller's SP offset since it is an estimate. For now we require the + // method to have at least a single arg so that we can use it to obtain caller's + // SP. + LclVarDsc* varDsc = compiler->lvaTable; + NYI_IF((varDsc == nullptr) || !varDsc->lvIsParam, "Profiler ELT callback for a method without any params"); + + // lea rdx, [FramePointer + Arg0's offset] + GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, REG_ARG_1, 0, 0); + } + + // We can use any callee trash register (other than RAX, RCX, RDX) for call target. + // We use R8 here. 
This will emit either + // "call ip-relative 32-bit offset" or + // "mov r8, helper addr; call r8" + genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_ARG_2); + +#else // !defined(UNIX_AMD64_ABI) + + // RDI = ProfilerMethHnd + if (compiler->compProfilerMethHndIndirected) + { + GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + else + { + if (compiler->opts.compJitELTHookEnabled) + { + genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); + } + else + { + instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); + } + } + + // RSI = caller's SP + if (compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) + { + int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); + GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); + } + else + { + LclVarDsc* varDsc = compiler->lvaTable; + NYI_IF((varDsc == nullptr) || !varDsc->lvIsParam, "Profiler ELT callback for a method without any params"); + + // lea rdx, [FramePointer + Arg0's offset] + GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, REG_ARG_1, 0, 0); + } + + // We can use any callee trash register (other than RAX, RDI, RSI) for call target. + // We use R11 here. This will emit either + // "call ip-relative 32-bit offset" or + // "mov r11, helper addr; call r11" + genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET); + +#endif // !defined(UNIX_AMD64_ABI) +} + + +#endif // PROFILING_SUPPORTED + +#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 3a2b32c02001..429a0b65ca43 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4371,12 +4371,13 @@ void Compiler::EndPhase(Phases phase) mostRecentlyActivePhase = phase; } - +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) inline void DoLlvmPhase(Compiler* _compiler) { fatal(CORJIT_SKIPPED); //assert(false); } +#endif //------------------------------------------------------------------------ // compCompile: run phases needed for compilation @@ -5053,8 +5054,10 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags Rationalizer rat(this); // PHASE_RATIONALIZE rat.Run(); +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO:after rat, but better before? DoLlvmPhase(this); // DoPhase? +#endif return; // Here we do "simple lowering". When the RyuJIT backend works for all diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 3feabf558b94..894f567a1f2f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -1815,7 +1815,7 @@ struct fgArgTabEntry unsigned roundedByteSize = roundUp(byteSize, TARGET_POINTER_SIZE); #endif // OSX_ARM64_ABI -#if !defined(TARGET_ARM) +#if !defined(TARGET_ARM) && !defined(TARGET_WASM32) // Arm32 could have a struct with 8 byte alignment // which rounded size % 8 is not 0. assert(m_byteAlignment != 0); @@ -7530,6 +7530,17 @@ class Compiler #elif defined(TARGET_ARM64) reg = REG_R11; regMask = RBM_R11; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) //TODO: empty better? 
+ if (isCoreRTABI) + { + reg = REG_R10; + regMask = RBM_R10; + } + else + { + reg = REG_R11; + regMask = RBM_R11; + } #else #error Unsupported or unset target architecture #endif @@ -7977,7 +7988,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX UNATIVE_OFFSET unwindGetCurrentOffset(FuncInfoDsc* func); -#if defined(TARGET_AMD64) +#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: delete? void unwindBegPrologWindows(); void unwindPushWindows(regNumber reg); @@ -8034,7 +8045,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // Get highest available level for SIMD codegen SIMDLevel getSIMDSupportLevel() { -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm if (compOpportunisticallyDependsOn(InstructionSet_AVX2)) { return SIMD_AVX2_Supported; @@ -9626,7 +9637,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // In case of Amd64 this doesn't include float regs saved on stack. unsigned compCalleeRegsPushed; -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // Mask of callee saved float regs on stack. regMaskTP compCalleeFPRegsSavedMask; #endif @@ -11228,6 +11239,27 @@ const instruction INS_SQRT = INS_fsqrt; #endif // TARGET_ARM64 +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) + +const instruction INS_SHIFT_LEFT_LOGICAL = INS_shl; +const instruction INS_SHIFT_RIGHT_LOGICAL = INS_shr; +const instruction INS_SHIFT_RIGHT_ARITHM = INS_sar; + +const instruction INS_AND = INS_and; +const instruction INS_OR = INS_or; +const instruction INS_XOR = INS_xor; +const instruction INS_NEG = INS_neg; +const instruction INS_TEST = INS_test; +const instruction INS_MUL = INS_imul; +const instruction INS_SIGNED_DIVIDE = INS_idiv; +const instruction INS_UNSIGNED_DIVIDE = INS_div; +const instruction INS_BREAKPOINT = INS_int3; +const instruction INS_ADDC = INS_adc; +const instruction INS_SUBC = INS_sbb; +const instruction INS_NOT = INS_not; + +#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) + /*****************************************************************************/ extern const BYTE genTypeSizes[]; diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index b42111611504..d136b4b570fc 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -2172,7 +2172,7 @@ bool emitter::emitHasEpilogEnd() #endif // JIT32_GCENCODER -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm /***************************************************************************** * @@ -3274,6 +3274,9 @@ const size_t hexEncodingSize = 19; #elif defined(TARGET_ARM) const size_t basicIndent = 12; const size_t hexEncodingSize = 11; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +const size_t basicIndent = 7; +const size_t hexEncodingSize = 21; #endif #ifdef DEBUG @@ -4474,6 +4477,8 @@ void emitter::emitJumpDistBind() // The size of IF_LARGEJMP/IF_LARGEADR/IF_LARGELDC are 8 or 12. // All other code size is 4. 
assert((sizeDif == 4) || (sizeDif == 8)); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + jmp->idCodeSize(jsz); #else #error Unsupported or unset target architecture #endif @@ -5908,6 +5913,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, #elif defined(TARGET_ARM64) assert(!jmp->idAddr()->iiaHasInstrCount()); emitOutputLJ(NULL, adr, jmp); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + * (BYTE*)adr -= (BYTE)adj; #else #error Unsupported or unset target architecture #endif @@ -5916,7 +5923,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { // Patch Forward non-Short Jump CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm *(int*)adr -= adj; #elif defined(TARGET_ARMARCH) assert(!jmp->idAddr()->iiaHasInstrCount()); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index a6000e2e498f..75da1c222b26 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -576,6 +576,10 @@ class emitter #elif defined(TARGET_ARM64) static_assert_no_msg(INS_count <= 512); instruction _idIns : 9; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO ?? + static_assert_no_msg(INS_count <= 1024); + instruction _idIns : 10; +#define MAX_ENCODED_SIZE 15 #else // !(defined(TARGET_XARCH) || defined(TARGET_ARM64)) static_assert_no_msg(INS_count <= 256); instruction _idIns : 8; @@ -585,6 +589,9 @@ class emitter #if defined(TARGET_XARCH) static_assert_no_msg(IF_COUNT <= 128); insFormat _idInsFmt : 7; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + static_assert_no_msg(IF_COUNT <= 128); + insFormat _idInsFmt : 7; #else static_assert_no_msg(IF_COUNT <= 256); insFormat _idInsFmt : 8; @@ -635,6 +642,12 @@ class emitter // doesn't cross a byte boundary. #elif defined(TARGET_ARM64) // Moved the definition of '_idOpSize' later so that we don't cross a 32-bit boundary when laying out bitfields + +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: wasm? + unsigned _idCodeSize : 4; // size of instruction in bytes. Max size of an Intel instruction is 15 bytes. + opSize _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16, 5=32 + // At this point we have fully consumed first DWORD so that next field + // doesn't cross a byte boundary. #else // ARM opSize _idOpSize : 2; // operand size: 0=1 , 1=2 , 2=4 , 3=8 #endif // ARM @@ -701,6 +714,9 @@ class emitter #elif defined(TARGET_XARCH) // For xarch, we have used 14 bits from the second DWORD. #define ID_EXTRA_BITFIELD_BITS (14) +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + // TODO: delete? +#define ID_EXTRA_BITFIELD_BITS (14) #else #error Unsupported or unset target architecture #endif @@ -852,7 +868,13 @@ class emitter regNumber _idReg3 : REGNUM_BITS; regNumber _idReg4 : REGNUM_BITS; }; -#endif // defined(TARGET_XARCH) +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: delete? + struct + { + regNumber _idReg3 : REGNUM_BITS; + regNumber _idReg4 : REGNUM_BITS; + }; +#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) } _idAddrUnion; @@ -950,7 +972,29 @@ class emitter _idInsFlags = sf; assert(sf == _idInsFlags); } -#endif // TARGET_ARM +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + + unsigned idCodeSize() const + { + return _idCodeSize; + } + void idCodeSize(unsigned sz) + { + if (sz > 15) + { + // This is a temporary workaround for non-precise instr size + // estimator on XARCH. 
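The bitfield widths chosen for the wasm instrDesc fields above follow directly from the asserted limits: a field that must hold N distinct values needs ceil(log2(N)) bits, so INS_count <= 1024 fits in 10 bits and IF_COUNT <= 128 in 7, and the code-size estimate that continues below has to be clamped into its 4-bit field. A compile-time restatement of that relationship (the counts are the asserted bounds, not the real enum sizes):

#include <cstdint>

// N distinct values need ceil(log2(N)) bits.
constexpr unsigned BitsFor(uint64_t count)
{
    unsigned bits = 0;
    while ((1ull << bits) < count)
    {
        ++bits;
    }
    return bits;
}

static_assert(BitsFor(1024) == 10, "instruction field: 10 bits cover INS_count <= 1024");
static_assert(BitsFor(128) == 7, "insFormat field: 7 bits cover IF_COUNT <= 128");
static_assert(BitsFor(16) == 4, "code size field: 4 bits cover sizes 0..15");

int main()
{
    return 0;
}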
It often overestimates sizes and can + // return value more than 15 that doesn't fit in 4 bits _idCodeSize. + // If somehow we generate instruction that needs more than 15 bytes we + // will fail on another assert in emit.cpp: noway_assert(id->idCodeSize() >= csz). + // Issue https://github.com/dotnet/runtime/issues/12840. + sz = 15; + } + assert(sz <= 15); // Intel decoder limit. + _idCodeSize = sz; + assert(sz == _idCodeSize); + } +#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) emitAttr idOpSize() { @@ -1318,6 +1362,24 @@ class emitter #define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_1C #define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_4C +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + +// a read,write or modify from stack location, possible def to use latency from L0 cache +#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_5C + +// a read, write or modify from constant location, possible def to use latency from L0 cache +#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_2C +#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_5C + +// a read, write or modify from memory location, possible def to use latency from L0 or L1 cache +// plus an extra cost (of 1.0) for a increased chance of a cache miss +#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_3C +#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_3C +#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_6C + #endif // TARGET_XXX // Make this an enum: @@ -1419,6 +1481,21 @@ class emitter #endif // TARGET_XARCH +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // copying AMD64 + + struct instrDescAmd : instrDesc // large addrmode disp + { + ssize_t idaAmdVal; + }; + + struct instrDescCnsAmd : instrDesc // large cons + addrmode disp + { + ssize_t idacCnsVal; + ssize_t idacAmdVal; + }; + +#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) + struct instrDescCGCA : instrDesc // call with ... { VARSET_TP idcGCvars; // ... 
updated GC vars or @@ -1472,7 +1549,7 @@ class emitter size_t emitGetInstrDescSize(const instrDesc* id); size_t emitGetInstrDescSizeSC(const instrDesc* id); -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) ssize_t emitGetInsCns(instrDesc* id); ssize_t emitGetInsDsp(instrDesc* id); @@ -1546,7 +1623,7 @@ class emitter unsigned emitEpilogCnt; UNATIVE_OFFSET emitEpilogSize; -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm void emitStartExitSeq(); // Mark the start of the "return" sequence emitLocation emitExitSeqBegLoc; diff --git a/src/coreclr/jit/emitdef.h b/src/coreclr/jit/emitdef.h index c9f003ccce1b..cde967a26fc4 100644 --- a/src/coreclr/jit/emitdef.h +++ b/src/coreclr/jit/emitdef.h @@ -12,6 +12,8 @@ #include "emitarm.h" #elif defined(TARGET_ARM64) #include "emitarm64.h" +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#include "emitwasm.h" #else #error Unsupported or unset target architecture #endif diff --git a/src/coreclr/jit/emitfmts.h b/src/coreclr/jit/emitfmts.h index c252c0b1237d..e9c56bd9bff4 100644 --- a/src/coreclr/jit/emitfmts.h +++ b/src/coreclr/jit/emitfmts.h @@ -8,6 +8,8 @@ #include "emitfmtsarm.h" #elif defined(TARGET_ARM64) #include "emitfmtsarm64.h" +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: need anything here? Try removing +#include "emitfmtswasm.h" #else #error Unsupported or unset target architecture #endif // target type diff --git a/src/coreclr/jit/emitfmtswasm.h b/src/coreclr/jit/emitfmtswasm.h new file mode 100644 index 000000000000..09c674ffba30 --- /dev/null +++ b/src/coreclr/jit/emitfmtswasm.h @@ -0,0 +1,218 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +////////////////////////////////////////////////////////////////////////////// + +// +// This file was previously known as emitfmts.h +// + +// clang-format off +#if !defined(TARGET_WASM32) && !defined(TARGET_WASM64) + #error Unexpected target type +#endif + +#ifdef DEFINE_ID_OPS +////////////////////////////////////////////////////////////////////////////// + +#undef DEFINE_ID_OPS + +enum ID_OPS +{ + ID_OP_NONE, // no additional arguments + ID_OP_SCNS, // small const operand (21-bits or less, no reloc) + ID_OP_CNS, // constant operand + ID_OP_DSP, // displacement operand + ID_OP_DSP_CNS, // displacement + constant + ID_OP_AMD, // addrmode with dsp + ID_OP_AMD_CNS, // addrmode with dsp + constant + ID_OP_JMP, // local jump + ID_OP_LBL, // label operand + ID_OP_CALL, // direct method call + ID_OP_SPEC, // special handling required +}; + +////////////////////////////////////////////////////////////////////////////// +#else // !DEFINE_ID_OPS +////////////////////////////////////////////////////////////////////////////// + +#ifdef DEFINE_IS_OPS +#undef DEFINE_IS_OPS + +#else // DEFINE_IS_OPS + +////////////////////////////////////////////////////////////////////////////// + +#ifndef IF_DEF +#error Must define IF_DEF macro before including this file +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// A note on the naming convention for instruction forms (IF_xxxxx). 
+// For 3-character code XYY, generally we have: +// X = +// R - register +// M - memory +// S - stack +// A - address mode +// YY = +// RD - read +// WR - write +// RW - read / write +// +// The following sequences don't follow this pattern: +// XYY = +// CNS - constant +// SHF - shift-constant +// +// For IF_XXX_YYY, the first operand is XXX, the second operand is YYY. +// +////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////// +// +// enum insFormat instruction enum ID_OPS +// scheduling +// (unused) +////////////////////////////////////////////////////////////////////////////// + +IF_DEF(NONE, IS_NONE, NONE) // no operands + +IF_DEF(LABEL, IS_NONE, JMP ) // label +IF_DEF(RWR_LABEL, IS_R1_WR, JMP ) // write label to register +IF_DEF(SWR_LABEL, IS_SF_WR, LBL ) // write label to stack + +IF_DEF(METHOD, IS_NONE, CALL) // method +IF_DEF(METHPTR, IS_NONE, CALL) // method ptr (glbl) + +IF_DEF(CNS, IS_NONE, SCNS) // const + +//---------------------------------------------------------------------------- +// NOTE: The order of the "RD/WR/RW" varieties must match that of +// the "insUpdateModes" enum in "instr.h". +//---------------------------------------------------------------------------- + +IF_DEF(RRD, IS_R1_RD, NONE) // read reg +IF_DEF(RWR, IS_R1_WR, NONE) // write reg +IF_DEF(RRW, IS_R1_RW, NONE) // r/w reg + +IF_DEF(RRD_CNS, IS_R1_RD, SCNS) // read reg , const +IF_DEF(RWR_CNS, IS_R1_WR, SCNS) // write reg , const +IF_DEF(RRW_CNS, IS_R1_RW, SCNS) // r/w reg , const +IF_DEF(RRW_SHF, IS_R1_RW, SCNS) // r/w reg , shift-const + +IF_DEF(RRD_RRD, IS_R1_RD|IS_R2_RD, NONE) // read reg , read reg2 +IF_DEF(RWR_RRD, IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2 +IF_DEF(RRW_RRD, IS_R1_RW|IS_R2_RD, NONE) // r/w reg , read reg2 +IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w reg , r/w reg2 - for XCHG reg, reg2 +IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW, SCNS) // r/w reg , r/w reg2 , const + +IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3 +IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const + +IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, CNS) // write reg , read reg2 , read reg3 , read reg4 +//---------------------------------------------------------------------------- +// The following formats are used for direct addresses (e.g. static data members) +//---------------------------------------------------------------------------- + +IF_DEF(MRD, IS_GM_RD, SPEC) // read [mem] (indirect call req. 
SPEC) +IF_DEF(MWR, IS_GM_WR, DSP) // write [mem] +IF_DEF(MRW, IS_GM_RW, DSP) // r/w [mem] +IF_DEF(MRD_OFF, IS_GM_RD, DSP) // offset mem + +IF_DEF(RRD_MRD, IS_GM_RD|IS_R1_RD, DSP) // read reg , read [mem] +IF_DEF(RWR_MRD, IS_GM_RD|IS_R1_WR, DSP) // write reg , read [mem] +IF_DEF(RRW_MRD, IS_GM_RD|IS_R1_RW, DSP) // r/w reg , read [mem] +IF_DEF(RRW_MRD_CNS, IS_GM_RD|IS_R1_RW, DSP_CNS) // r/w reg , read [mem], const + +IF_DEF(RWR_RRD_MRD, IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP) // write reg , read reg2 , read [mem] +IF_DEF(RWR_MRD_CNS, IS_GM_RD|IS_R1_WR, DSP_CNS) // write reg , read [mem], const +IF_DEF(RWR_RRD_MRD_CNS, IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP_CNS) // write reg , read reg2 , read [mem], const +IF_DEF(RWR_RRD_MRD_RRD, IS_GM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, DSP_CNS) // write reg , read reg2 , read [mem], read reg3 +IF_DEF(RWR_MRD_OFF, IS_GM_RD|IS_R1_WR, DSP) // write reg , offset mem + +IF_DEF(MRD_RRD, IS_GM_RD|IS_R1_RD, DSP) // read [mem], read reg +IF_DEF(MWR_RRD, IS_GM_WR|IS_R1_RD, DSP) // write [mem], read reg +IF_DEF(MRW_RRD, IS_GM_RW|IS_R1_RD, DSP) // r/w [mem], read reg + +IF_DEF(MRD_CNS, IS_GM_RD, DSP_CNS) // read [mem], const +IF_DEF(MWR_CNS, IS_GM_WR, DSP_CNS) // write [mem], const +IF_DEF(MRW_CNS, IS_GM_RW, DSP_CNS) // r/w [mem], const + +IF_DEF(MWR_RRD_CNS, IS_GM_WR|IS_R1_RD, DSP_CNS) // write [mem], read reg, const + +IF_DEF(MRW_SHF, IS_GM_RW, DSP_CNS) // shift [mem], const + +//---------------------------------------------------------------------------- +// The following formats are used for stack frame refs +//---------------------------------------------------------------------------- + +IF_DEF(SRD, IS_SF_RD, SPEC) // read [stk] (indirect call req. SPEC) +IF_DEF(SWR, IS_SF_WR, NONE) // write [stk] +IF_DEF(SRW, IS_SF_RW, NONE) // r/w [stk] + +IF_DEF(RRD_SRD, IS_SF_RD|IS_R1_RD, NONE) // read reg , read [stk] +IF_DEF(RWR_SRD, IS_SF_RD|IS_R1_WR, NONE) // write reg , read [stk] +IF_DEF(RRW_SRD, IS_SF_RD|IS_R1_RW, NONE) // r/w reg , read [stk] +IF_DEF(RRW_SRD_CNS, IS_SF_RD|IS_R1_RW, CNS ) // r/w reg , read [stk], const + +IF_DEF(RWR_RRD_SRD, IS_SF_RD|IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2, read [stk] +IF_DEF(RWR_SRD_CNS, IS_SF_RD|IS_R1_WR, CNS ) // write reg , read [stk], const +IF_DEF(RWR_RRD_SRD_CNS, IS_SF_RD|IS_R1_WR|IS_R2_RD, CNS ) // write reg , read reg2, read [stk], const +IF_DEF(RWR_RRD_SRD_RRD, IS_SF_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, CNS ) // write reg , read reg2, read [stk], read reg3 + +IF_DEF(SRD_RRD, IS_SF_RD|IS_R1_RD, NONE) // read [stk], read reg +IF_DEF(SWR_RRD, IS_SF_WR|IS_R1_RD, NONE) // write [stk], read reg +IF_DEF(SRW_RRD, IS_SF_RW|IS_R1_RD, NONE) // r/w [stk], read reg + +IF_DEF(SRD_CNS, IS_SF_RD, CNS ) // read [stk], const +IF_DEF(SWR_CNS, IS_SF_WR, CNS ) // write [stk], const +IF_DEF(SRW_CNS, IS_SF_RW, CNS ) // r/w [stk], const + +IF_DEF(SWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [stk], read reg, const + +IF_DEF(SRW_SHF, IS_SF_RW, CNS ) // shift [stk], const + +//---------------------------------------------------------------------------- +// The following formats are used for indirect address modes +//---------------------------------------------------------------------------- + + +IF_DEF(ARD, IS_AM_RD, SPEC) // read [adr] (indirect call req. 
SPEC) +IF_DEF(AWR, IS_AM_WR, AMD ) // write [adr] +IF_DEF(ARW, IS_AM_RW, AMD ) // r/w [adr] + +IF_DEF(RRD_ARD, IS_AM_RD|IS_R1_RD, AMD ) // read reg , read [adr] +IF_DEF(RWR_ARD, IS_AM_RD|IS_R1_WR, AMD ) // write reg , read [adr] +IF_DEF(RRW_ARD, IS_AM_RD|IS_R1_RW, AMD ) // r/w reg , read [adr] +IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [adr], const + +IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr] +IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR, AMD_CNS) // write reg , read [adr], const +IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD) // write reg , read [adr], read reg2 +IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write reg , read reg2, read [adr], const +IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write reg , read reg2, read [adr], read reg3 + +IF_DEF(ARD_RRD, IS_AM_RD|IS_R1_RD, AMD ) // read [adr], read reg +IF_DEF(AWR_RRD, IS_AM_WR|IS_R1_RD, AMD ) // write [adr], read reg +IF_DEF(ARW_RRD, IS_AM_RW|IS_R1_RD, AMD ) // r/w [adr], read reg + +IF_DEF(AWR_RRD_RRD, IS_AM_WR|IS_R1_RD|IS_R2_RD, AMD ) // write [adr], read reg, read reg + +IF_DEF(ARD_CNS, IS_AM_RD, AMD_CNS) // read [adr], const +IF_DEF(AWR_CNS, IS_AM_WR, AMD_CNS) // write [adr], const +IF_DEF(ARW_CNS, IS_AM_RW, AMD_CNS) // r/w [adr], const + +IF_DEF(AWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [adr], read reg, const + +IF_DEF(ARW_SHF, IS_AM_RW, AMD_CNS) // shift [adr], const + +////////////////////////////////////////////////////////////////////////////// + +#undef IF_DEF + +////////////////////////////////////////////////////////////////////////////// +#endif // DEFINE_IS_OPS +#endif // DEFINE_ID_OPS +////////////////////////////////////////////////////////////////////////////// +// clang-format on diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 484eca3399b4..564e1e452b6e 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -101,7 +101,7 @@ inline regNumber emitter::inst3opImulReg(instruction ins) * get stored in different places within the instruction descriptor. 
*/ -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) inline ssize_t emitter::emitGetInsAmd(instrDesc* id) { @@ -335,6 +335,50 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) id->idReg2((regNumber)encodeMask); // Save in idReg2 +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // copy AMD64 + assert(REGNUM_BITS >= 4); + encodeMask = 0; + + if ((regmask & RBM_RSI) != RBM_NONE) + { + encodeMask |= 0x01; + } + if ((regmask & RBM_RDI) != RBM_NONE) + { + encodeMask |= 0x02; + } + if ((regmask & RBM_RBX) != RBM_NONE) + { + encodeMask |= 0x04; + } + if ((regmask & RBM_RBP) != RBM_NONE) + { + encodeMask |= 0x08; + } + + id->idReg1((regNumber)encodeMask); // Save in idReg1 + + encodeMask = 0; + + if ((regmask & RBM_R12) != RBM_NONE) + { + encodeMask |= 0x01; + } + if ((regmask & RBM_R13) != RBM_NONE) + { + encodeMask |= 0x02; + } + if ((regmask & RBM_R14) != RBM_NONE) + { + encodeMask |= 0x04; + } + if ((regmask & RBM_R15) != RBM_NONE) + { + encodeMask |= 0x08; + } + + id->idReg2((regNumber)encodeMask); // Save in idReg2 + #else NYI("unknown target"); #endif @@ -446,6 +490,45 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) regmask |= RBM_R27; if ((encodeMask & 0x10) != 0) regmask |= RBM_R28; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: copy of AMD64 + assert(REGNUM_BITS >= 4); + encodeMask = id->idReg1(); + + if ((encodeMask & 0x01) != 0) + { + regmask |= RBM_RSI; + } + if ((encodeMask & 0x02) != 0) + { + regmask |= RBM_RDI; + } + if ((encodeMask & 0x04) != 0) + { + regmask |= RBM_RBX; + } + if ((encodeMask & 0x08) != 0) + { + regmask |= RBM_RBP; + } + + encodeMask = id->idReg2(); + + if ((encodeMask & 0x01) != 0) + { + regmask |= RBM_R12; + } + if ((encodeMask & 0x02) != 0) + { + regmask |= RBM_R13; + } + if ((encodeMask & 0x04) != 0) + { + regmask |= RBM_R14; + } + if ((encodeMask & 0x08) != 0) + { + regmask |= RBM_R15; + } #else NYI("unknown target"); diff --git a/src/coreclr/jit/emitjmps.h b/src/coreclr/jit/emitjmps.h index 4ed340302119..5b37fd152c6d 100644 --- a/src/coreclr/jit/emitjmps.h +++ b/src/coreclr/jit/emitjmps.h @@ -46,6 +46,27 @@ JMP_SMALL(lt , ge , blt ) // LT JMP_SMALL(gt , le , bgt ) // GT JMP_SMALL(le , gt , ble ) // LE +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) //copying AMD64 + +// jump reverse instruction +JMP_SMALL(jmp, jmp, jmp) +JMP_SMALL(jo, jno, jo) +JMP_SMALL(jno, jo, jno) +JMP_SMALL(jb, jae, jb) +JMP_SMALL(jae, jb, jae) +JMP_SMALL(je, jne, je) +JMP_SMALL(jne, je, jne) +JMP_SMALL(jbe, ja, jbe) +JMP_SMALL(ja, jbe, ja) +JMP_SMALL(js, jns, js) +JMP_SMALL(jns, js, jns) +JMP_SMALL(jp, jnp, jp) +JMP_SMALL(jnp, jp, jnp) +JMP_SMALL(jl, jge, jl) +JMP_SMALL(jge, jl, jge) +JMP_SMALL(jle, jg, jle) +JMP_SMALL(jg, jle, jg) + #else #error Unsupported or unset target architecture #endif // target type diff --git a/src/coreclr/jit/emitwasm.cpp b/src/coreclr/jit/emitwasm.cpp new file mode 100644 index 000000000000..194af05aa127 --- /dev/null +++ b/src/coreclr/jit/emitwasm.cpp @@ -0,0 +1,7217 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
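// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch above): a minimal, self-contained
// model of the callee-saved GC register mask packing shown in the emitinl.h
// hunk earlier in this diff. Eight registers are folded into two 4-bit groups,
// mirroring how the real code stores them in idReg1/idReg2. The bit layout and
// all names below are hypothetical simplifications, not the emitter's types.
// ----------------------------------------------------------------------------
#include <cassert>
#include <cstdint>

// One bit per callee-saved register, in the same order the hunk tests them.
enum CalleeSavedBit : uint32_t
{
    MASK_RSI = 1u << 0, MASK_RDI = 1u << 1, MASK_RBX = 1u << 2, MASK_RBP = 1u << 3,
    MASK_R12 = 1u << 4, MASK_R13 = 1u << 5, MASK_R14 = 1u << 6, MASK_R15 = 1u << 7,
};

// Pack the low four registers into one nibble and the high four into another.
static inline void packCalleeSavedMask(uint32_t regmask, uint8_t* lowNibble, uint8_t* highNibble)
{
    *lowNibble  = (uint8_t)(regmask & 0xF);        // RSI/RDI/RBX/RBP -> "idReg1"
    *highNibble = (uint8_t)((regmask >> 4) & 0xF); // R12..R15        -> "idReg2"
}

// Reverse of packCalleeSavedMask; the round trip must be lossless.
static inline uint32_t unpackCalleeSavedMask(uint8_t lowNibble, uint8_t highNibble)
{
    return (uint32_t)lowNibble | ((uint32_t)highNibble << 4);
}

// Usage: pack a mask, unpack it, and check nothing was lost.
static void calleeSavedMaskRoundTripExample()
{
    uint32_t mask = MASK_RBX | MASK_RBP | MASK_R14;
    uint8_t  lo, hi;
    packCalleeSavedMask(mask, &lo, &hi);
    assert(unpackCalleeSavedMask(lo, hi) == mask);
}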
+ +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX emitwasm.cpp XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm + +/*****************************************************************************/ +/*****************************************************************************/ + +#include "instr.h" +#include "emit.h" +#include "codegen.h" + +bool IsSSEInstruction(instruction ins) +{ + return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_SSE_INSTRUCTION); +} + +bool IsSSEOrAVXInstruction(instruction ins) +{ + return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); +} + +bool IsAVXOnlyInstruction(instruction ins) +{ + return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); +} + +bool IsFMAInstruction(instruction ins) +{ + return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION); +} + +bool IsBMIInstruction(instruction ins) +{ + return (ins >= INS_FIRST_BMI_INSTRUCTION) && (ins <= INS_LAST_BMI_INSTRUCTION); +} + +regNumber getBmiRegNumber(instruction ins) +{ + switch (ins) + { + case INS_blsi: + { + return (regNumber)3; + } + + case INS_blsmsk: + { + return (regNumber)2; + } + + case INS_blsr: + { + return (regNumber)1; + } + + default: + { + assert(IsBMIInstruction(ins)); + return REG_NA; + } + } +} + +regNumber getSseShiftRegNumber(instruction ins) +{ + switch (ins) + { + case INS_psrldq: + { + return (regNumber)3; + } + + case INS_pslldq: + { + return (regNumber)7; + } + + case INS_psrld: + case INS_psrlw: + case INS_psrlq: + { + return (regNumber)2; + } + + case INS_pslld: + case INS_psllw: + case INS_psllq: + { + return (regNumber)6; + } + + case INS_psrad: + case INS_psraw: + { + return (regNumber)4; + } + + default: + { + assert(!"Invalid instruction for SSE2 instruction of the form: opcode reg, immed8"); + return REG_NA; + } + } +} + +bool emitter::IsAVXInstruction(instruction ins) +{ + return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); +} + +// Returns true if the AVX instruction is a binary operator that requires 3 operands. +// When we emit an instruction with only two operands, we will duplicate the destination +// as a source. +// TODO-XArch-Cleanup: This is a temporary solution for now. Eventually this needs to +// be formalized by adding an additional field to instruction table to +// to indicate whether a 3-operand instruction. +bool emitter::IsDstDstSrcAVXInstruction(instruction ins) +{ + return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsAVXInstruction(ins); +} + +// Returns true if the AVX instruction requires 3 operands that duplicate the source +// register in the vvvv field. +// TODO-XArch-Cleanup: This is a temporary solution for now. Eventually this needs to +// be formalized by adding an additional field to instruction table to +// to indicate whether a 3-operand instruction. 
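// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch above): a minimal model of the
// operand widening the two comments above describe. When a two-operand form is
// emitted with a VEX encoding, either the destination is repeated as the first
// source (dst-dst-src, e.g. "addps x1, x2" -> "vaddps x1, x1, x2"), or the
// source is repeated into VEX.vvvv (dst-src-src, assumed here for reg-reg
// moves such as movss). Everything below is hypothetical and only models that
// choice; it is not the emitter's encoding path.
// ----------------------------------------------------------------------------
#include <cstdio>

enum class VexWidening
{
    DstDstSrc, // binary ops: duplicate the destination as a source
    DstSrcSrc  // moves: duplicate the source into the vvvv field
};

static void printWidenedForm(const char* mnemonic, int dstXmm, int srcXmm, VexWidening kind)
{
    int middle = (kind == VexWidening::DstDstSrc) ? dstXmm : srcXmm;
    printf("v%s xmm%d, xmm%d, xmm%d\n", mnemonic, dstXmm, middle, srcXmm);
}

// printWidenedForm("addps", 1, 2, VexWidening::DstDstSrc) -> "vaddps xmm1, xmm1, xmm2"
// printWidenedForm("movss", 1, 2, VexWidening::DstSrcSrc) -> "vmovss xmm1, xmm2, xmm2"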
+bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) +{ + return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsAVXInstruction(ins); +} + +//------------------------------------------------------------------------ +// AreUpper32BitsZero: check if some previously emitted +// instruction set the upper 32 bits of reg to zero. +// +// Arguments: +// reg - register of interest +// +// Return Value: +// true if previous instruction zeroed reg's upper 32 bits. +// false if it did not, or if we can't safely determine. +// +// Notes: +// Currently only looks back one instruction. +// +// movsx eax, ... might seem viable but we always encode this +// instruction with a 64 bit destination. See TakesRexWPrefix. + +bool emitter::AreUpper32BitsZero(regNumber reg) +{ + // If there are no instructions in this IG, we can look back at + // the previous IG's instructions if this IG is an extension. + // + if ((emitCurIGinsCnt == 0) && ((emitCurIG->igFlags & IGF_EXTEND) == 0)) + { + return false; + } + + instrDesc* id = emitLastIns; + insFormat fmt = id->idInsFmt(); + + // This isn't meant to be a comprehensive check. Just look for what + // seems to be common. + switch (fmt) + { + case IF_RWR_CNS: + case IF_RRW_CNS: + case IF_RRW_SHF: + case IF_RWR_RRD: + case IF_RRW_RRD: + case IF_RWR_MRD: + case IF_RWR_SRD: + case IF_RWR_ARD: + + // Bail if not writing to the right register + if (id->idReg1() != reg) + { + return false; + } + + // Bail if movsx, we always have movsx sign extend to 8 bytes + if (id->idIns() == INS_movsx) + { + return false; + } + + // movzx always zeroes the upper 32 bits. + if (id->idIns() == INS_movzx) + { + return true; + } + + // Else rely on operation size. + return (id->idOpSize() == EA_4BYTE); + + default: + break; + } + + return false; +} + +//------------------------------------------------------------------------ +// AreFlagsSetToZeroCmp: Checks if the previous instruction set the SZ, and optionally OC, flags to +// the same values as if there were a compare to 0 +// +// Arguments: +// reg - register of interest +// opSize - size of register +// needsOCFlags - additionally check the overflow and carry flags +// +// Return Value: +// true if the previous instruction set the flags for reg +// false if not, or if we can't safely determine +// +// Notes: +// Currently only looks back one instruction. 
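// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch above): a standalone model of the
// one-instruction look-back the comment above describes. If the previous
// instruction already left SF/ZF (and, when required, OF/CF) as a compare with
// zero would, a following "test reg, reg" can be skipped. The enum and helper
// below are hypothetical simplifications, not the emitter's real types.
// ----------------------------------------------------------------------------
enum class PrevOp
{
    And, Or, Xor,        // always clear OF/CF and set SF/ZF from the result
    Add, Sub, Neg, Shr,  // set SF/ZF from the result, but OF/CF reflect the op
    Mov, Other           // do not reliably set flags from the result
};

static bool canSkipCompareToZero(PrevOp prev, int prevDstReg, int regOfInterest, bool needsOverflowCarry)
{
    if (prevDstReg != regOfInterest) // the flags describe some other register
        return false;

    switch (prev)
    {
        case PrevOp::And:
        case PrevOp::Or:
        case PrevOp::Xor:
            return true; // SF/ZF usable, OF/CF forced to zero
        case PrevOp::Add:
        case PrevOp::Sub:
        case PrevOp::Neg:
        case PrevOp::Shr:
            return !needsOverflowCarry; // SF/ZF usable, OF/CF are not "compare to 0"
        default:
            return false;
    }
}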
+bool emitter::AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, bool needsOCFlags) +{ + assert(reg != REG_NA); + // Don't look back across IG boundaries (possible control flow) + if (emitCurIGinsCnt == 0 && ((emitCurIG->igFlags & IGF_EXTEND) == 0)) + { + return false; + } + + instrDesc* id = emitLastIns; + insFormat fmt = id->idInsFmt(); + + // make sure op1 is a reg + switch (fmt) + { + case IF_RWR_CNS: + case IF_RRW_CNS: + case IF_RRW_SHF: + case IF_RWR_RRD: + case IF_RRW_RRD: + case IF_RWR_MRD: + case IF_RWR_SRD: + case IF_RRW_SRD: + case IF_RWR_ARD: + case IF_RRW_ARD: + case IF_RWR: + case IF_RRD: + case IF_RRW: + break; + + default: + return false; + } + + if (id->idReg1() != reg) + { + return false; + } + + switch (id->idIns()) + { + case INS_adc: + case INS_add: + case INS_dec: + case INS_dec_l: + case INS_inc: + case INS_inc_l: + case INS_neg: + case INS_shr_1: + case INS_shl_1: + case INS_sar_1: + case INS_sbb: + case INS_sub: + case INS_xadd: + if (needsOCFlags) + { + return false; + } + FALLTHROUGH; + // these always set OC to 0 + case INS_and: + case INS_or: + case INS_xor: + return id->idOpSize() == opSize; + + default: + break; + } + + return false; +} + +//------------------------------------------------------------------------ +// IsDstSrcImmAvxInstruction: Checks if the instruction has a "reg, reg/mem, imm" or +// "reg/mem, reg, imm" form for the legacy, VEX, and EVEX +// encodings. +// +// Arguments: +// instruction -- processor instruction to check +// +// Return Value: +// true if instruction has a "reg, reg/mem, imm" or "reg/mem, reg, imm" encoding +// form for the legacy, VEX, and EVEX encodings. +// +// That is, the instruction takes two operands, one of which is immediate, and it +// does not need to encode any data in the VEX.vvvv field. +// +static bool IsDstSrcImmAvxInstruction(instruction ins) +{ + switch (ins) + { + case INS_aeskeygenassist: + case INS_extractps: + case INS_pextrb: + case INS_pextrw: + case INS_pextrd: + case INS_pextrq: + case INS_pshufd: + case INS_pshufhw: + case INS_pshuflw: + case INS_roundpd: + case INS_roundps: + return true; + default: + return false; + } +} + +// ------------------------------------------------------------------- +// Is4ByteSSEInstruction: Returns true if the SSE instruction is a 4-byte opcode. +// +// Arguments: +// ins - instruction +// +// Note that this should be true for any of the instructions in instrsXArch.h +// that use the SSE38 or SSE3A macro but returns false if the VEX encoding is +// in use, since that encoding does not require an additional byte. +bool emitter::Is4ByteSSEInstruction(instruction ins) +{ + return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins); +} + +// Returns true if this instruction requires a VEX prefix +// All AVX instructions require a VEX prefix +bool emitter::TakesVexPrefix(instruction ins) +{ + // special case vzeroupper as it requires 2-byte VEX prefix + // special case the fencing, movnti and the prefetch instructions as they never take a VEX prefix + switch (ins) + { + case INS_lfence: + case INS_mfence: + case INS_movnti: + case INS_prefetchnta: + case INS_prefetcht0: + case INS_prefetcht1: + case INS_prefetcht2: + case INS_sfence: + case INS_vzeroupper: + return false; + default: + break; + } + + return IsAVXInstruction(ins); +} + +// Add base VEX prefix without setting W, R, X, or B bits +// L bit will be set based on emitter attr. 
+// +// 2-byte VEX prefix = C5 +// 3-byte VEX prefix = C4 +// - R, X, B, W - bits to express corresponding REX prefixes +// - m-mmmmm (5-bit) +// 0-00001 - implied leading 0F opcode byte +// 0-00010 - implied leading 0F 38 opcode bytes +// 0-00011 - implied leading 0F 3A opcode bytes +// Rest - reserved for future use and usage of them will uresult in Undefined instruction exception +// +// - vvvv (4-bits) - register specifier in 1's complement form; must be 1111 if unused +// - L - scalar or AVX-128 bit operations (L=0), 256-bit operations (L=1) +// - pp (2-bits) - opcode extension providing equivalent functionality of a SIMD size prefix +// these prefixes are treated mandatory when used with escape opcode 0Fh for +// some SIMD instructions +// 00 - None (0F - packed float) +// 01 - 66 (66 0F - packed double) +// 10 - F3 (F3 0F - scalar float +// 11 - F2 (F2 0F - scalar double) +#define DEFAULT_3BYTE_VEX_PREFIX 0xC4E07800000000ULL +#define DEFAULT_3BYTE_VEX_PREFIX_MASK 0xFFFFFF00000000ULL +#define LBIT_IN_3BYTE_VEX_PREFIX 0x00000400000000ULL +emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr attr) +{ + // The 2-byte VEX encoding is preferred when possible, but actually emitting + // it depends on a number of factors that we may not know until much later. + // + // In order to handle this "easily", we just carry the 3-byte encoding all + // the way through and "fix-up" the encoding when the VEX prefix is actually + // emitted, by simply checking that all the requirements were met. + + // Only AVX instructions require VEX prefix + assert(IsAVXInstruction(ins)); + + // Shouldn't have already added VEX prefix + assert(!hasVexPrefix(code)); + + assert((code & DEFAULT_3BYTE_VEX_PREFIX_MASK) == 0); + + code |= DEFAULT_3BYTE_VEX_PREFIX; + + if (attr == EA_32BYTE) + { + // Set L bit to 1 in case of instructions that operate on 256-bits. + code |= LBIT_IN_3BYTE_VEX_PREFIX; + } + + return code; +} + +// Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix +bool TakesRexWPrefix(instruction ins, emitAttr attr) +{ + // Because the current implementation of AVX does not have a way to distinguish between the register + // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are + // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE), + // and here we must special case these by the opcode. 
+ switch (ins) + { + case INS_vpermpd: + case INS_vpermq: + case INS_vpsrlvq: + case INS_vpsllvq: + case INS_pinsrq: + case INS_pextrq: + case INS_vfmadd132pd: + case INS_vfmadd213pd: + case INS_vfmadd231pd: + case INS_vfmadd132sd: + case INS_vfmadd213sd: + case INS_vfmadd231sd: + case INS_vfmaddsub132pd: + case INS_vfmaddsub213pd: + case INS_vfmaddsub231pd: + case INS_vfmsubadd132pd: + case INS_vfmsubadd213pd: + case INS_vfmsubadd231pd: + case INS_vfmsub132pd: + case INS_vfmsub213pd: + case INS_vfmsub231pd: + case INS_vfmsub132sd: + case INS_vfmsub213sd: + case INS_vfmsub231sd: + case INS_vfnmadd132pd: + case INS_vfnmadd213pd: + case INS_vfnmadd231pd: + case INS_vfnmadd132sd: + case INS_vfnmadd213sd: + case INS_vfnmadd231sd: + case INS_vfnmsub132pd: + case INS_vfnmsub213pd: + case INS_vfnmsub231pd: + case INS_vfnmsub132sd: + case INS_vfnmsub213sd: + case INS_vfnmsub231sd: + case INS_vpmaskmovq: + case INS_vpgatherdq: + case INS_vpgatherqq: + case INS_vgatherdpd: + case INS_vgatherqpd: + return true; + default: + break; + } + +#ifdef TARGET_AMD64 + // movsx should always sign extend out to 8 bytes just because we don't track + // whether the dest should be 4 bytes or 8 bytes (attr indicates the size + // of the source, not the dest). + // A 4-byte movzx is equivalent to an 8 byte movzx, so it is not special + // cased here. + // + // Rex_jmp = jmp with rex prefix always requires rex.w prefix. + if (ins == INS_movsx || ins == INS_rex_jmp) + { + return true; + } + + if (EA_SIZE(attr) != EA_8BYTE) + { + return false; + } + + if (IsSSEOrAVXInstruction(ins)) + { + switch (ins) + { + case INS_andn: + case INS_bextr: + case INS_blsi: + case INS_blsmsk: + case INS_blsr: + case INS_bzhi: + case INS_cvttsd2si: + case INS_cvttss2si: + case INS_cvtsd2si: + case INS_cvtss2si: + case INS_cvtsi2sd: + case INS_cvtsi2ss: + case INS_mov_xmm2i: + case INS_mov_i2xmm: + case INS_movnti: + case INS_mulx: + case INS_pdep: + case INS_pext: + case INS_rorx: + return true; + default: + return false; + } + } + + // TODO-XArch-Cleanup: Better way to not emit REX.W when we don't need it, than just testing all these + // opcodes... + // These are all the instructions that default to 8-byte operand without the REX.W bit + // With 1 special case: movzx because the 4 byte version still zeros-out the hi 4 bytes + // so we never need it + if ((ins != INS_push) && (ins != INS_pop) && (ins != INS_movq) && (ins != INS_movzx) && (ins != INS_push_hide) && + (ins != INS_pop_hide) && (ins != INS_ret) && (ins != INS_call) && !((ins >= INS_i_jmp) && (ins <= INS_l_jg))) + { + return true; + } + else + { + return false; + } +#else //! TARGET_AMD64 = TARGET_X86 + return false; +#endif //! TARGET_AMD64 +} + +// Returns true if using this register will require a REX.* prefix. +// Since XMM registers overlap with YMM registers, this routine +// can also be used to know whether a YMM register if the +// instruction in question is AVX. +bool IsExtendedReg(regNumber reg) +{ +#ifdef TARGET_AMD64 + return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM15)); +#else + // X86 JIT operates in 32-bit mode and hence extended reg are not available. 
+ return false; +#endif +} + +// Returns true if using this register, for the given EA_SIZE(attr), will require a REX.* prefix +bool IsExtendedReg(regNumber reg, emitAttr attr) +{ +#ifdef TARGET_AMD64 + // Not a register, so doesn't need a prefix + if (reg > REG_XMM15) + { + return false; + } + + // Opcode field only has 3 bits for the register, these high registers + // need a 4th bit, that comes from the REX prefix (eiter REX.X, REX.R, or REX.B) + if (IsExtendedReg(reg)) + { + return true; + } + + if (EA_SIZE(attr) != EA_1BYTE) + { + return false; + } + + // There are 12 one byte registers addressible 'below' r8b: + // al, cl, dl, bl, ah, ch, dh, bh, spl, bpl, sil, dil. + // The first 4 are always addressible, the last 8 are divided into 2 sets: + // ah, ch, dh, bh + // -- or -- + // spl, bpl, sil, dil + // Both sets are encoded exactly the same, the difference is the presence + // of a REX prefix, even a REX prefix with no other bits set (0x40). + // So in order to get to the second set we need a REX prefix (but no bits). + // + // TODO-AMD64-CQ: if we ever want to start using the first set, we'll need a different way of + // encoding/tracking/encoding registers. + return (reg >= REG_RSP); +#else + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +#endif +} + +// Since XMM registers overlap with YMM registers, this routine +// can also used to know whether a YMM register in case of AVX instructions. +bool IsXMMReg(regNumber reg) +{ +#ifdef TARGET_AMD64 + return (reg >= REG_XMM0) && (reg <= REG_XMM15); +#else // !TARGET_AMD64 + return (reg >= REG_XMM0) && (reg <= REG_XMM7); +#endif // !TARGET_AMD64 +} + +// Returns bits to be encoded in instruction for the given register. +unsigned RegEncoding(regNumber reg) +{ + static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE"); + return (unsigned)(reg & 0x7); +} + +// Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes +// SSE2: separate 1-byte prefix gets added before opcode. +// AVX: specific bits within VEX prefix need to be set in bit-inverted form. +emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) +{ + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // W-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // W-bit is the only bit that is added in non bit-inverted form. + return emitter::code_t(code | 0x00008000000000ULL); + } + } +#ifdef TARGET_AMD64 + return emitter::code_t(code | 0x4800000000ULL); +#else + assert(!"UNREACHED"); + return code; +#endif +} + +#ifdef TARGET_AMD64 + +emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) +{ + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // R-bit is supported by both 2-byte and 3-byte VEX prefix + assert(hasVexPrefix(code)); + + // R-bit is added in bit-inverted form. + return code & 0xFF7FFFFFFFFFFFULL; + } + } + + return code | 0x4400000000ULL; +} + +emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) +{ + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // X-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // X-bit is added in bit-inverted form. 
+ return code & 0xFFBFFFFFFFFFFFULL; + } + } + + return code | 0x4200000000ULL; +} + +emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) +{ + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // B-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // B-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFULL; + } + } + + return code | 0x4100000000ULL; +} + +// Adds REX prefix (0x40) without W, R, X or B bits set +emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) +{ + assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); + return code | 0x4000000000ULL; +} + +#endif // TARGET_AMD64 + +bool isPrefix(BYTE b) +{ + assert(b != 0); // Caller should check this + assert(b != 0x67); // We don't use the address size prefix + assert(b != 0x65); // The GS segment override prefix is emitted separately + assert(b != 0x64); // The FS segment override prefix is emitted separately + assert(b != 0xF0); // The lock prefix is emitted separately + assert(b != 0x2E); // We don't use the CS segment override prefix + assert(b != 0x3E); // Or the DS segment override prefix + assert(b != 0x26); // Or the ES segment override prefix + assert(b != 0x36); // Or the SS segment override prefix + + // That just leaves the size prefixes used in SSE opcodes: + // Scalar Double Scalar Single Packed Double + return ((b == 0xF2) || (b == 0xF3) || (b == 0x66)); +} + +// Outputs VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise. +unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code) +{ + abort(); +} + +#ifdef TARGET_AMD64 +/***************************************************************************** + * Is the last instruction emitted a call instruction? + */ +bool emitter::emitIsLastInsCall() +{ + if ((emitLastIns != nullptr) && (emitLastIns->idIns() == INS_call)) + { + return true; + } + + return false; +} + +/***************************************************************************** + * We're about to create an epilog. If the last instruction we output was a 'call', + * then we need to insert a NOP, to allow for proper exception-handling behavior. + */ +void emitter::emitOutputPreEpilogNOP() +{ + if (emitIsLastInsCall()) + { + emitIns(INS_nop); + } +} + +#endif // TARGET_AMD64 + +// Size of rex prefix in bytes +unsigned emitter::emitGetRexPrefixSize(instruction ins) +{ + // In case of AVX instructions, REX prefixes are part of VEX prefix. + // And hence requires no additional byte to encode REX prefixes. + if (IsAVXInstruction(ins)) + { + return 0; + } + + // If not AVX, then we would need 1-byte to encode REX prefix. + return 1; +} + +// Size of vex prefix in bytes +unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr) +{ + if (IsAVXInstruction(ins)) + { + return 3; + } + + // If not AVX, then we don't need to encode vex prefix. + return 0; +} + +//------------------------------------------------------------------------ +// emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current +// configuration. 
+// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// code -- The current opcode and any known prefixes +unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code) +{ + unsigned adjustedSize = 0; + + if (IsAVXInstruction(ins)) + { + // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. + // Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always + // overstimate. + // Instead this routine will adjust the size of VEX prefix based on the number of bytes of opcode it encodes so + // that + // instruction size estimate will be accurate. + // Basically this will decrease the vexPrefixSize, so that opcodeSize + vexPrefixAdjustedSize will be the right + // size. + // + // rightOpcodeSize + vexPrefixSize + // = (opcodeSize - ExtrabytesSize) + vexPrefixSize + // = opcodeSize + (vexPrefixSize - ExtrabytesSize) + // = opcodeSize + vexPrefixAdjustedSize + + unsigned vexPrefixAdjustedSize = emitGetVexPrefixSize(ins, attr); + assert(vexPrefixAdjustedSize == 3); + + // In this case, opcode will contains escape prefix at least one byte, + // vexPrefixAdjustedSize should be minus one. + vexPrefixAdjustedSize -= 1; + + // Get the fourth byte in Opcode. + // If this byte is non-zero, then we should check whether the opcode contains SIMD prefix or not. + BYTE check = (code >> 24) & 0xFF; + if (check != 0) + { + // 3-byte opcode: with the bytes ordered as 0x2211RM33 or + // 4-byte opcode: with the bytes ordered as 0x22114433 + // Simd prefix is at the first byte. + BYTE sizePrefix = (code >> 16) & 0xFF; + if (sizePrefix != 0 && isPrefix(sizePrefix)) + { + vexPrefixAdjustedSize -= 1; + } + + // If the opcode size is 4 bytes, then the second escape prefix is at fourth byte in opcode. + // But in this case the opcode has not counted R\M part. + // opcodeSize + VexPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize + //=opcodeSize + VexPrefixAdjustedSize -1 + 1 + //=opcodeSize + VexPrefixAdjustedSize + // So although we may have second byte escape prefix, we won't decrease vexPrefixAdjustedSize. + } + + adjustedSize = vexPrefixAdjustedSize; + } + else if (Is4ByteSSEInstruction(ins)) + { + // The 4-Byte SSE instructions require one additional byte to hold the ModRM byte + adjustedSize++; + } + else + { + if (ins == INS_crc32) + { + // Adjust code size for CRC32 that has 4-byte opcode but does not use SSE38 or EES3A encoding. + adjustedSize++; + } + + if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx)) + { + // Most 16-bit operand instructions will need a 0x66 prefix. 
+ adjustedSize++; + } + } + + return adjustedSize; +} + +// Get size of rex or vex prefix emitted in code +unsigned emitter::emitGetPrefixSize(code_t code) +{ + if (hasVexPrefix(code)) + { + return 3; + } + + if (hasRexPrefix(code)) + { + return 1; + } + + return 0; +} + +#ifdef TARGET_X86 +/***************************************************************************** + * + * Record a non-empty stack + */ + +void emitter::emitMarkStackLvl(unsigned stackLevel) +{ + assert(int(stackLevel) >= 0); + assert(emitCurStackLvl == 0); + assert(emitCurIG->igStkLvl == 0); + assert(emitCurIGfreeNext == emitCurIGfreeBase); + + assert(stackLevel && stackLevel % sizeof(int) == 0); + + emitCurStackLvl = emitCurIG->igStkLvl = stackLevel; + + if (emitMaxStackDepth < emitCurStackLvl) + { + JITDUMP("Upping emitMaxStackDepth from %d to %d\n", emitMaxStackDepth, emitCurStackLvl); + emitMaxStackDepth = emitCurStackLvl; + } +} +#endif + +/***************************************************************************** + * + * Get hold of the address mode displacement value for an indirect call. + */ + +//inline ssize_t emitter::emitGetInsCIdisp(instrDesc* id) +//{ +// if (id->idIsLargeCall()) +// { +// return ((instrDescCGCA*)id)->idcDisp; +// } +// else +// { +// assert(!id->idIsLargeDsp()); +// assert(!id->idIsLargeCns()); +// +// return id->idAddr()->iiaAddrMode.amDisp; +// } +//} + +/** *************************************************************************** + * + * The following table is used by the instIsFP()/instUse/DefFlags() helpers. + */ + +// clang-format off +const insFlags CodeGenInterface::instInfo[] = +{ + #define INST0(id, nm, um, mr, flags) static_cast(flags), + #define INST1(id, nm, um, mr, flags) static_cast(flags), + #define INST2(id, nm, um, mr, mi, flags) static_cast(flags), + #define INST3(id, nm, um, mr, mi, rm, flags) static_cast(flags), + #define INST4(id, nm, um, mr, mi, rm, a4, flags) static_cast(flags), + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) static_cast(flags), + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 +}; +// clang-format on + +/***************************************************************************** + * + * Initialize the table used by emitInsModeFormat(). + */ + +// clang-format off +const BYTE emitter::emitInsModeFmtTab[] = +{ + #define INST0(id, nm, um, mr, flags) um, + #define INST1(id, nm, um, mr, flags) um, + #define INST2(id, nm, um, mr, mi, flags) um, + #define INST3(id, nm, um, mr, mi, rm, flags) um, + #define INST4(id, nm, um, mr, mi, rm, a4, flags) um, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) um, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 +}; +// clang-format on + +#ifdef DEBUG +unsigned const emitter::emitInsModeFmtCnt = _countof(emitInsModeFmtTab); +#endif + +/***************************************************************************** + * + * Combine the given base format with the update mode of the instuction. + */ + +inline emitter::insFormat emitter::emitInsModeFormat(instruction ins, insFormat base) +{ + assert(IF_RRD + IUM_RD == IF_RRD); + assert(IF_RRD + IUM_WR == IF_RWR); + assert(IF_RRD + IUM_RW == IF_RRW); + + return (insFormat)(base + emitInsUpdateMode(ins)); +} + +// This is a helper we need due to Vs Whidbey #254016 in order to distinguish +// if we can not possibly be updating an integer register. 
This is not the best +// solution, but the other ones (see bug) are going to be much more complicated. +bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) +{ + instruction ins = id->idIns(); + + if (!IsSSEOrAVXInstruction(ins)) + { + return false; + } + + switch (ins) + { + case INS_andn: + case INS_bextr: + case INS_blsi: + case INS_blsmsk: + case INS_blsr: + case INS_bzhi: + case INS_cvttsd2si: + case INS_cvttss2si: + case INS_cvtsd2si: + case INS_cvtss2si: + case INS_extractps: + case INS_mov_xmm2i: + case INS_movmskpd: + case INS_movmskps: + case INS_mulx: + case INS_pdep: + case INS_pext: + case INS_pmovmskb: + case INS_pextrb: + case INS_pextrd: + case INS_pextrq: + case INS_pextrw: + case INS_pextrw_sse41: + case INS_rorx: + { + // These SSE instructions write to a general purpose integer register. + return false; + } + + default: + { + return true; + } + } +} + +/***************************************************************************** + * + * Returns the base encoding of the given CPU instruction. + */ + +inline size_t insCode(instruction ins) +{ + // clang-format off + const static + size_t insCodes[] = + { + #define INST0(id, nm, um, mr, flags) mr, + #define INST1(id, nm, um, mr, flags) mr, + #define INST2(id, nm, um, mr, mi, flags) mr, + #define INST3(id, nm, um, mr, mi, rm, flags) mr, + #define INST4(id, nm, um, mr, mi, rm, a4, flags) mr, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 + }; + // clang-format on + + assert((unsigned)ins < _countof(insCodes)); + assert((insCodes[ins] != BAD_CODE)); + + return insCodes[ins]; +} + +/***************************************************************************** + * + * Returns the "AL/AX/EAX, imm" accumulator encoding of the given instruction. + */ + +inline size_t insCodeACC(instruction ins) +{ + // clang-format off + const static + size_t insCodesACC[] = + { + #define INST0(id, nm, um, mr, flags) + #define INST1(id, nm, um, mr, flags) + #define INST2(id, nm, um, mr, mi, flags) + #define INST3(id, nm, um, mr, mi, rm, flags) + #define INST4(id, nm, um, mr, mi, rm, a4, flags) a4, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) a4, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 + }; + // clang-format on + + assert((unsigned)ins < _countof(insCodesACC)); + assert((insCodesACC[ins] != BAD_CODE)); + + return insCodesACC[ins]; +} + +/***************************************************************************** + * + * Returns the "register" encoding of the given CPU instruction. 
+ */ + +inline size_t insCodeRR(instruction ins) +{ + // clang-format off + const static + size_t insCodesRR[] = + { + #define INST0(id, nm, um, mr, flags) + #define INST1(id, nm, um, mr, flags) + #define INST2(id, nm, um, mr, mi, flags) + #define INST3(id, nm, um, mr, mi, rm, flags) + #define INST4(id, nm, um, mr, mi, rm, a4, flags) + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rr, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 + }; + // clang-format on + + assert((unsigned)ins < _countof(insCodesRR)); + assert((insCodesRR[ins] != BAD_CODE)); + + return insCodesRR[ins]; +} + +// clang-format off +const static +size_t insCodesRM[] = +{ + #define INST0(id, nm, um, mr, flags) + #define INST1(id, nm, um, mr, flags) + #define INST2(id, nm, um, mr, mi, flags) + #define INST3(id, nm, um, mr, mi, rm, flags) rm, + #define INST4(id, nm, um, mr, mi, rm, a4, flags) rm, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rm, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 +}; +// clang-format on + +// Returns true iff the give CPU instruction has an RM encoding. +inline bool hasCodeRM(instruction ins) +{ + assert((unsigned)ins < _countof(insCodesRM)); + return ((insCodesRM[ins] != BAD_CODE)); +} + +/***************************************************************************** + * + * Returns the "reg, [r/m]" encoding of the given CPU instruction. + */ + +inline size_t insCodeRM(instruction ins) +{ + assert((unsigned)ins < _countof(insCodesRM)); + assert((insCodesRM[ins] != BAD_CODE)); + + return insCodesRM[ins]; +} + +// clang-format off +const static +size_t insCodesMI[] = +{ + #define INST0(id, nm, um, mr, flags) + #define INST1(id, nm, um, mr, flags) + #define INST2(id, nm, um, mr, mi, flags) mi, + #define INST3(id, nm, um, mr, mi, rm, flags) mi, + #define INST4(id, nm, um, mr, mi, rm, a4, flags) mi, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mi, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 +}; +// clang-format on + +// Returns true iff the give CPU instruction has an MI encoding. +inline bool hasCodeMI(instruction ins) +{ + assert((unsigned)ins < _countof(insCodesMI)); + return ((insCodesMI[ins] != BAD_CODE)); +} + +/***************************************************************************** + * + * Returns the "[r/m], 32-bit icon" encoding of the given CPU instruction. + */ + +inline size_t insCodeMI(instruction ins) +{ + assert((unsigned)ins < _countof(insCodesMI)); + assert((insCodesMI[ins] != BAD_CODE)); + + return insCodesMI[ins]; +} + +// clang-format off +const static +size_t insCodesMR[] = +{ + #define INST0(id, nm, um, mr, flags) + #define INST1(id, nm, um, mr, flags) mr, + #define INST2(id, nm, um, mr, mi, flags) mr, + #define INST3(id, nm, um, mr, mi, rm, flags) mr, + #define INST4(id, nm, um, mr, mi, rm, a4, flags) mr, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr, + #include "instrs.h" + #undef INST0 + #undef INST1 + #undef INST2 + #undef INST3 + #undef INST4 + #undef INST5 +}; +// clang-format on + +// Returns true iff the give CPU instruction has an MR encoding. 
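// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch above): the insCodesRM / insCodesMI /
// insCodesMR tables above are all produced by re-including the instruction
// definition header with different expansions of the INSTn macros. A minimal,
// self-contained version of that X-macro pattern looks like this; the three
// instructions and their encodings below are made up for illustration.
// ----------------------------------------------------------------------------
#define BAD_ENCODING 0xFFFFFFFFu

// Stand-in for instrs.h: name, "MR" encoding, "MI" encoding.
#define DEMO_INSTRUCTIONS(OP)          \
    OP(demo_add, 0x01u, 0x81u)         \
    OP(demo_mov, 0x89u, 0xC7u)         \
    OP(demo_nop, 0x90u, BAD_ENCODING)

// First expansion: enumerate the instructions.
enum DemoIns
{
#define DEMO_ENUM(name, mr, mi) DEMO_##name,
    DEMO_INSTRUCTIONS(DEMO_ENUM)
#undef DEMO_ENUM
    DEMO_COUNT
};

// Second expansion: a parallel table holding one encoding per instruction.
static const unsigned demoCodesMI[] =
{
#define DEMO_MI(name, mr, mi) mi,
    DEMO_INSTRUCTIONS(DEMO_MI)
#undef DEMO_MI
};

// Mirrors hasCodeMI(): an instruction has the form iff its slot is not BAD_ENCODING.
static inline bool demoHasCodeMI(DemoIns ins)
{
    return demoCodesMI[ins] != BAD_ENCODING;
}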
+inline bool hasCodeMR(instruction ins) +{ + assert((unsigned)ins < _countof(insCodesMR)); + return ((insCodesMR[ins] != BAD_CODE)); +} + +/***************************************************************************** + * + * Returns the "[r/m], reg" or "[r/m]" encoding of the given CPU instruction. + */ + +inline size_t insCodeMR(instruction ins) +{ + assert((unsigned)ins < _countof(insCodesMR)); + assert((insCodesMR[ins] != BAD_CODE)); + + return insCodesMR[ins]; +} + +// Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h. +bool emitter::EncodedBySSE38orSSE3A(instruction ins) +{ + const size_t SSE38 = 0x0F660038; + const size_t SSE3A = 0x0F66003A; + const size_t MASK = 0xFFFF00FF; + + size_t insCode = 0; + + if (!IsSSEOrAVXInstruction(ins)) + { + return false; + } + + if (hasCodeRM(ins)) + { + insCode = insCodeRM(ins); + } + else if (hasCodeMI(ins)) + { + insCode = insCodeMI(ins); + } + else if (hasCodeMR(ins)) + { + insCode = insCodeMR(ins); + } + + insCode &= MASK; + return insCode == SSE38 || insCode == SSE3A; +} + +/***************************************************************************** + * + * Returns an encoding for the specified register to be used in the bit0-2 + * part of an opcode. + */ + +inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code) +{ + assert(reg < REG_STK); + +#ifdef TARGET_AMD64 + // Either code is not NULL or reg is not an extended reg. + // If reg is an extended reg, instruction needs to be prefixed with 'REX' + // which would require code != NULL. + assert(code != nullptr || !IsExtendedReg(reg)); + + if (IsExtendedReg(reg)) + { + *code = AddRexBPrefix(ins, *code); // REX.B + } + else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) + { + // We are assuming that we only use/encode SPL, BPL, SIL and DIL + // not the corresponding AH, CH, DH, or BH + *code = AddRexPrefix(ins, *code); // REX + } +#endif // TARGET_AMD64 + + unsigned regBits = RegEncoding(reg); + + assert(regBits < 8); + return regBits; +} + +/***************************************************************************** + * + * Returns an encoding for the specified register to be used in the bit3-5 + * part of an opcode. + */ + +inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code) +{ + assert(reg < REG_STK); + +#ifdef TARGET_AMD64 + // Either code is not NULL or reg is not an extended reg. + // If reg is an extended reg, instruction needs to be prefixed with 'REX' + // which would require code != NULL. + assert(code != nullptr || !IsExtendedReg(reg)); + + if (IsExtendedReg(reg)) + { + *code = AddRexRPrefix(ins, *code); // REX.R + } + else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) + { + // We are assuming that we only use/encode SPL, BPL, SIL and DIL + // not the corresponding AH, CH, DH, or BH + *code = AddRexPrefix(ins, *code); // REX + } +#endif // TARGET_AMD64 + + unsigned regBits = RegEncoding(reg); + + assert(regBits < 8); + return (regBits << 3); +} + +/*********************************************************************************** + * + * Returns modified AVX opcode with the specified register encoded in bits 3-6 of + * byte 2 of VEX prefix. 
+ */ +inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) +{ + assert(reg < REG_STK); + assert(IsAVXInstruction(ins)); + assert(hasVexPrefix(code)); + + // Get 4-bit register encoding + // RegEncoding() gives lower 3 bits + // IsExtendedReg() gives MSB. + code_t regBits = RegEncoding(reg); + if (IsExtendedReg(reg)) + { + regBits |= 0x08; + } + + // VEX prefix encodes register operand in 1's complement form + // Shift count = 4-bytes of opcode + 0-2 bits + assert(regBits <= 0xF); + regBits <<= 35; + return code ^ regBits; +} + +/***************************************************************************** + * + * Returns an encoding for the specified register to be used in the bit3-5 + * part of an SIB byte (unshifted). + * Used exclusively to generate the REX.X bit and truncate the register. + */ + +inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* code) +{ + assert(reg < REG_STK); + +#ifdef TARGET_AMD64 + // Either code is not NULL or reg is not an extended reg. + // If reg is an extended reg, instruction needs to be prefixed with 'REX' + // which would require code != NULL. + assert(code != nullptr || reg < REG_R8 || (reg >= REG_XMM0 && reg < REG_XMM8)); + + if (IsExtendedReg(reg)) + { + *code = AddRexXPrefix(ins, *code); // REX.X + } + unsigned regBits = RegEncoding(reg); +#else // !TARGET_AMD64 + unsigned regBits = reg; +#endif // !TARGET_AMD64 + + assert(regBits < 8); + return regBits; +} + +/***************************************************************************** + * + * Returns the "[r/m]" opcode with the mod/RM field set to register. + */ + +inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) +{ + // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. + // Otherwise, it will be placed after the 4 byte encoding. + if ((code & 0xFF00) == 0) + { + assert((code & 0xC000) == 0); + code |= 0xC000; + } + + return code; +} + +/***************************************************************************** + * + * Returns the given "[r/m]" opcode with the mod/RM field set to register. + */ + +inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) +{ + // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. + // Otherwise, it will be placed after the 4 byte encoding. + if ((code & 0xFF00) == 0) + { + assert((code & 0xC000) == 0); + code |= 0xC000; + } + return code; +} + +/***************************************************************************** + * + * Returns the "byte ptr [r/m]" opcode with the mod/RM field set to + * the given register. + */ + +inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code) +{ + assert((code & 0xC000) == 0); + code |= 0xC000; + unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + code |= regcode; + return code; +} + +/***************************************************************************** + * + * Returns the "byte ptr [r/m], icon" opcode with the mod/RM field set to + * the given register. 
+ */ + +inline emitter::code_t emitter::insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code) +{ + assert((code & 0xC000) == 0); + code |= 0xC000; + unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + code |= regcode; + return code; +} + +/***************************************************************************** + * + * Returns true iff the given instruction does not have a "[r/m], icon" form, but *does* have a + * "reg,reg,imm8" form. + */ +inline bool insNeedsRRIb(instruction ins) +{ + // If this list gets longer, use a switch or a table. + return ins == INS_imul; +} + +/***************************************************************************** + * + * Returns the "reg,reg,imm8" opcode with both the reg's set to the + * the given register. + */ +inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, emitAttr size) +{ + assert(size == EA_4BYTE); // All we handle for now. + assert(insNeedsRRIb(ins)); + // If this list gets longer, use a switch, or a table lookup. + code_t code = 0x69c0; + unsigned regcode = insEncodeReg012(ins, reg, size, &code); + // We use the same register as source and destination. (Could have another version that does both regs...) + code |= regcode; + code |= (regcode << 3); + return code; +} + +/***************************************************************************** + * + * Returns the "+reg" opcode with the the given register set into the low + * nibble of the opcode + */ + +inline emitter::code_t emitter::insEncodeOpreg(instruction ins, regNumber reg, emitAttr size) +{ + code_t code = insCodeRR(ins); + unsigned regcode = insEncodeReg012(ins, reg, size, &code); + code |= regcode; + return code; +} + +/***************************************************************************** + * + * Return the 'SS' field value for the given index scale factor. 
+ */ + +inline unsigned emitter::insSSval(unsigned scale) +{ + assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); + + const static BYTE scales[] = { + 0x00, // 1 + 0x40, // 2 + 0xFF, // 3 + 0x80, // 4 + 0xFF, // 5 + 0xFF, // 6 + 0xFF, // 7 + 0xC0, // 8 + }; + + return scales[scale - 1]; +} + +const instruction emitJumpKindInstructions[] = {INS_nop, + +#define JMP_SMALL(en, rev, ins) INS_##ins, +#include "emitjmps.h" + + INS_call}; + +const emitJumpKind emitReverseJumpKinds[] = { + EJ_NONE, + +#define JMP_SMALL(en, rev, ins) EJ_##rev, +#include "emitjmps.h" +}; + +/***************************************************************************** + * Look up the instruction for a jump kind + */ + +/*static*/ instruction emitter::emitJumpKindToIns(emitJumpKind jumpKind) +{ + assert((unsigned)jumpKind < ArrLen(emitJumpKindInstructions)); + return emitJumpKindInstructions[jumpKind]; +} + +/***************************************************************************** + * Reverse the conditional jump + */ + +/* static */ emitJumpKind emitter::emitReverseJumpKind(emitJumpKind jumpKind) +{ + assert(jumpKind < EJ_COUNT); + return emitReverseJumpKinds[jumpKind]; +} + +/***************************************************************************** + * The size for these instructions is less than EA_4BYTE, + * but the target register need not be byte-addressable + */ + +inline bool emitInstHasNoCode(instruction ins) +{ + if (ins == INS_align) + { + return true; + } + + return false; +} + +/***************************************************************************** + * When encoding instructions that operate on byte registers + * we have to ensure that we use a low register (EAX, EBX, ECX or EDX) + * otherwise we will incorrectly encode the instruction + */ + +bool emitter::emitVerifyEncodable(instruction ins, emitAttr size, regNumber reg1, regNumber reg2 /* = REG_NA */) +{ +#if CPU_HAS_BYTE_REGS + if (size != EA_1BYTE) // Not operating on a byte register is fine + { + return true; + } + + if ((ins != INS_movsx) && // These three instructions support high register + (ins != INS_movzx) // encodings for reg1 +#ifdef FEATURE_HW_INTRINSICS + && (ins != INS_crc32) +#endif + ) + { + // reg1 must be a byte-able register + if ((genRegMask(reg1) & RBM_BYTE_REGS) == 0) + { + return false; + } + } + // if reg2 is not REG_NA then reg2 must be a byte-able register + if ((reg2 != REG_NA) && ((genRegMask(reg2) & RBM_BYTE_REGS) == 0)) + { + return false; + } +#endif + // The instruction can be encoded + return true; +} + +/***************************************************************************** + * + * Estimate the size (in bytes of generated code) of the given instruction. + */ + +inline UNATIVE_OFFSET emitter::emitInsSize(code_t code) +{ + UNATIVE_OFFSET size = (code & 0xFF000000) ? 4 : (code & 0x00FF0000) ? 
3 : 2; +#ifdef TARGET_AMD64 + size += emitGetPrefixSize(code); +#endif + return size; +} + +//------------------------------------------------------------------------ +// emitInsSizeRR: Determines the code size for an instruction encoding that does not have any addressing modes +// +// Arguments: +// ins -- The instruction being emitted +// code -- The current opcode and any known prefixes +inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) +{ + assert(false); + //assert(id->idIns() != INS_invalid); + + //instruction ins = id->idIns(); + //emitAttr attr = id->idOpSize(); + + //UNATIVE_OFFSET sz = emitInsSize(code); + + //sz += emitGetAdjustedSize(ins, attr, code); + + //// REX prefix + //if (TakesRexWPrefix(ins, attr) || IsExtendedReg(id->idReg1(), attr) || IsExtendedReg(id->idReg2(), attr) || + // (!id->idIsSmallDsc() && (IsExtendedReg(id->idReg3(), attr) || IsExtendedReg(id->idReg4(), attr)))) + //{ + // sz += emitGetRexPrefixSize(ins); + //} + + //return sz; + return 0; +} + +//------------------------------------------------------------------------ +// emitInsSizeRR: Determines the code size for an instruction encoding that does not have any addressing modes and +// includes an immediate value +// +// Arguments: +// ins -- The instruction being emitted +// code -- The current opcode and any known prefixes +// val -- The immediate value to encode +inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val) +{ + instruction ins = id->idIns(); + UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(id->idOpSize()); + bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(valSize <= sizeof(INT32) || !id->idIsCnsReloc()); +#endif // TARGET_AMD64 + + if (valSize > sizeof(INT32)) + { + valSize = sizeof(INT32); + } + + if (id->idIsCnsReloc()) + { + valInByte = false; // relocs can't be placed in a byte + assert(valSize == sizeof(INT32)); + } + + if (valInByte) + { + valSize = sizeof(char); + } + else + { + assert(!IsSSEOrAVXInstruction(ins)); + } + + return valSize + emitInsSizeRR(id, code); +} + +inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr) +{ + emitAttr size = EA_SIZE(attr); + + UNATIVE_OFFSET sz; + + // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes. + // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes. + // This would probably be better expressed as a different format or something? + code_t code = insCodeRM(ins); + + if ((code & 0xFF00) != 0) + { + sz = IsSSEOrAVXInstruction(ins) ? emitInsSize(code) : 5; + } + else + { + sz = emitInsSize(insEncodeRMreg(ins, code)); + } + + sz += emitGetAdjustedSize(ins, size, insCodeRM(ins)); + + // REX prefix + if (!hasRexPrefix(code)) + { + if ((TakesRexWPrefix(ins, size) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || + IsExtendedReg(reg2, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + } + + return sz; +} + +/*****************************************************************************/ + +inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp) +{ + UNATIVE_OFFSET size = emitInsSize(code); + UNATIVE_OFFSET offs; + bool offsIsUpperBound = true; + bool EBPbased = true; + + /* Is this a temporary? 
*/ + + if (var < 0) + { + /* An address off of ESP takes an extra byte */ + + if (!emitHasFramePtr) + { + size++; + } + + // The offset is already assigned. Find the temp. + TempDsc* tmp = codeGen->regSet.tmpFindNum(var, RegSet::TEMP_USAGE_USED); + if (tmp == nullptr) + { + // It might be in the free lists, if we're working on zero initializing the temps. + tmp = codeGen->regSet.tmpFindNum(var, RegSet::TEMP_USAGE_FREE); + } + assert(tmp != nullptr); + offs = tmp->tdTempOffs(); + + // We only care about the magnitude of the offset here, to determine instruction size. + if (emitComp->isFramePointerUsed()) + { + if ((int)offs < 0) + { + offs = -(int)offs; + } + } + else + { + // SP-based offsets must already be positive. + assert((int)offs >= 0); + } + } + else + { + + /* Get the frame offset of the (non-temp) variable */ + + offs = dsp + emitComp->lvaFrameAddress(var, &EBPbased); + + /* An address off of ESP takes an extra byte */ + + if (!EBPbased) + { + ++size; + } + + /* Is this a stack parameter reference? */ + + if ((emitComp->lvaIsParameter(var) +#if !defined(TARGET_AMD64) || defined(UNIX_AMD64_ABI) + && !emitComp->lvaIsRegArgument(var) +#endif // !TARGET_AMD64 || UNIX_AMD64_ABI + ) || + (static_cast(var) == emitComp->lvaRetAddrVar)) + { + /* If no EBP frame, arguments and ret addr are off of ESP, above temps */ + + if (!EBPbased) + { + assert((int)offs >= 0); + + offsIsUpperBound = false; // since #temps can increase + offs += emitMaxTmpSize; + } + } + else + { + /* Locals off of EBP are at negative offsets */ + + if (EBPbased) + { +#if defined(TARGET_AMD64) && !defined(UNIX_AMD64_ABI) + // If localloc is not used, then ebp chaining is done and hence + // offset of locals will be at negative offsets, Otherwise offsets + // will be positive. In future, when RBP gets positioned in the + // middle of the frame so as to optimize instruction encoding size, + // the below asserts needs to be modified appropriately. + // However, for Unix platforms, we always do frame pointer chaining, + // so offsets from the frame pointer will always be negative. + if (emitComp->compLocallocUsed || emitComp->opts.compDbgEnC) + { + noway_assert((int)offs >= 0); + } + else +#endif + { + // Dev10 804810 - failing this assert can lead to bad codegen and runtime crashes + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef UNIX_AMD64_ABI + LclVarDsc* varDsc = emitComp->lvaTable + var; + bool isRegPassedArg = varDsc->lvIsParam && varDsc->lvIsRegArg; + // Register passed args could have a stack offset of 0. + noway_assert((int)offs < 0 || isRegPassedArg || emitComp->opts.IsOSR()); +#else // !UNIX_AMD64_ABI + + // OSR transitioning to RBP frame currently can have mid-frame FP + noway_assert(((int)offs < 0) || emitComp->opts.IsOSR()); +#endif // !UNIX_AMD64_ABI + } + + assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); + + // lvaInlinedPInvokeFrameVar and lvaStubArgumentVar are placed below the temps + if (unsigned(var) == emitComp->lvaInlinedPInvokeFrameVar || + unsigned(var) == emitComp->lvaStubArgumentVar) + { + offs -= emitMaxTmpSize; + } + + if ((int)offs < 0) + { + // offset is negative + return size + ((int(offs) >= SCHAR_MIN) ? sizeof(char) : sizeof(int)); + } +#ifdef TARGET_AMD64 + // This case arises for localloc frames + else + { + return size + ((offs <= SCHAR_MAX) ? sizeof(char) : sizeof(int)); + } +#endif + } + + if (emitComp->lvaTempsHaveLargerOffsetThanVars() == false) + { + offs += emitMaxTmpSize; + } + } + } + + assert((int)offs >= 0); + +#if !FEATURE_FIXED_OUT_ARGS + + /* Are we addressing off of ESP? 
*/ + + if (!emitHasFramePtr) + { + /* Adjust the effective offset if necessary */ + + if (emitCntStackDepth) + offs += emitCurStackLvl; + + // we could (and used to) check for the special case [sp] here but the stack offset + // estimator was off, and there is very little harm in overestimating for such a + // rare case. + } + +#endif // !FEATURE_FIXED_OUT_ARGS + +// printf("lcl = %04X, tmp = %04X, stk = %04X, offs = %04X\n", +// emitLclSize, emitMaxTmpSize, emitCurStackLvl, offs); + +#ifdef TARGET_AMD64 + bool useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX); +#else + bool useSmallEncoding = (offs <= size_t(SCHAR_MAX)); +#endif + + // If it is ESP based, and the offset is zero, we will not encode the disp part. + if (!EBPbased && offs == 0) + { + return size; + } + else + { + return size + (useSmallEncoding ? sizeof(char) : sizeof(int)); + } +} + +inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp) +{ + assert(id->idIns() != INS_invalid); + instruction ins = id->idIns(); + emitAttr attrSize = id->idOpSize(); + UNATIVE_OFFSET prefix = emitGetAdjustedSize(ins, attrSize, code); + + // REX prefix + if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || + IsExtendedReg(id->idReg2(), attrSize)) + { + prefix += emitGetRexPrefixSize(ins); + } + + return prefix + emitInsSizeSV(code, var, dsp); +} + +inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val) +{ + assert(id->idIns() != INS_invalid); + instruction ins = id->idIns(); + emitAttr attrSize = id->idOpSize(); + UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(attrSize); + UNATIVE_OFFSET prefix = emitGetAdjustedSize(ins, attrSize, code); + bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(valSize <= sizeof(int) || !id->idIsCnsReloc()); +#endif // TARGET_AMD64 + + if (valSize > sizeof(int)) + { + valSize = sizeof(int); + } + + if (id->idIsCnsReloc()) + { + valInByte = false; // relocs can't be placed in a byte + assert(valSize == sizeof(int)); + } + + if (valInByte) + { + valSize = sizeof(char); + } + else + { + assert(!IsSSEOrAVXInstruction(ins)); + } + + // 64-bit operand instructions will need a REX.W prefix + if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || + IsExtendedReg(id->idReg2(), attrSize)) + { + prefix += emitGetRexPrefixSize(ins); + } + + return prefix + valSize + emitInsSizeSV(code, var, dsp); +} + +/*****************************************************************************/ + +//static bool baseRegisterRequiresSibByte(regNumber base) +//{ +//#ifdef TARGET_AMD64 +// return base == REG_ESP || base == REG_R12; +//#else +// return base == REG_ESP; +//#endif +//} + +//static bool baseRegisterRequiresDisplacement(regNumber base) +//{ +//#ifdef TARGET_AMD64 +// return base == REG_EBP || base == REG_R13; +//#else +// return base == REG_EBP; +//#endif +//} + +UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) +{ + assert(false); + return 0; +} + +inline UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code, int val) +{ + assert(id->idIns() != INS_invalid); + instruction ins = id->idIns(); + UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(id->idOpSize()); + bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); + + // We should 
never generate BT mem,reg because it has poor performance. BT mem,imm might be useful + // but it requires special handling of the immediate value (it is always encoded in a byte). + // Let's not complicate things until this is needed. + assert(ins != INS_bt); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(valSize <= sizeof(INT32) || !id->idIsCnsReloc()); +#endif // TARGET_AMD64 + + if (valSize > sizeof(INT32)) + { + valSize = sizeof(INT32); + } + + if (id->idIsCnsReloc()) + { + valInByte = false; // relocs can't be placed in a byte + assert(valSize == sizeof(INT32)); + } + + if (valInByte) + { + valSize = sizeof(char); + } + else + { + assert(!IsSSEOrAVXInstruction(ins)); + } + + return valSize + emitInsSizeAM(id, code); +} + +inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code) +{ + assert(id->idIns() != INS_invalid); + instruction ins = id->idIns(); + emitAttr attrSize = id->idOpSize(); + + // fgMorph changes any statics that won't fit into 32-bit addresses + // into constants with an indir, rather than GT_CLS_VAR + // so we should only hit this path for statics that are RIP-relative + UNATIVE_OFFSET size = sizeof(INT32); + + size += emitGetAdjustedSize(ins, attrSize, code); + + // 64-bit operand instructions will need a REX.W prefix + if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || + IsExtendedReg(id->idReg2(), attrSize)) + { + size += emitGetRexPrefixSize(ins); + } + + return size + emitInsSize(code); +} + +inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code, int val) +{ + instruction ins = id->idIns(); + UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(id->idOpSize()); + bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); + +#ifndef TARGET_AMD64 + // occasionally longs get here on x86 + if (valSize > sizeof(INT32)) + valSize = sizeof(INT32); +#endif // !TARGET_AMD64 + + if (id->idIsCnsReloc()) + { + valInByte = false; // relocs can't be placed in a byte + assert(valSize == sizeof(INT32)); + } + + if (valInByte) + { + valSize = sizeof(char); + } + else + { + assert(!IsSSEOrAVXInstruction(ins)); + } + + return valSize + emitInsSizeCV(id, code); +} + +/***************************************************************************** + * + * Allocate instruction descriptors for instructions with address modes. + */ + +inline emitter::instrDesc* emitter::emitNewInstrAmd(emitAttr size, ssize_t dsp) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Set the displacement field in an instruction. Only handles instrDescAmd type. + */ + +inline void emitter::emitSetAmdDisp(instrDescAmd* id, ssize_t dsp) +{ + if (dsp < AM_DISP_MIN || dsp > AM_DISP_MAX) + { + id->idSetIsLargeDsp(); +#ifdef DEBUG + id->idAddr()->iiaAddrMode.amDisp = AM_DISP_BIG_VAL; +#endif + id->idaAmdVal = dsp; + } + else + { + id->idSetIsSmallDsp(); + id->idAddr()->iiaAddrMode.amDisp = dsp; + assert(id->idAddr()->iiaAddrMode.amDisp == dsp); // make sure the value fit + } +} + +/***************************************************************************** + * + * Allocate an instruction descriptor for an instruction that uses both + * an address mode displacement and a constant. 
+ */ + +emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int cns) +{ + assert(false); + return 0; +} + +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert an alignment instruction here to ensure that +// we can properly align the code. +// +void emitter::emitLoopAlign(unsigned short paddingBytes) +{ + assert(false); +} + +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert alignment instruction(s) here to ensure that +// we can properly align the code. +// +// This emits more than one `INS_align` instruction depending on the +// alignmentBoundary parameter. +// +void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) +{ + assert(false); +} + +/***************************************************************************** + * + * Add a NOP instruction of the given size. + */ + +void emitter::emitIns_Nop(unsigned size) +{ + assert(size <= MAX_ENCODED_SIZE); + + instrDesc* id = emitNewInstr(); + id->idIns(INS_nop); + id->idInsFmt(IF_NONE); + id->idCodeSize(size); + + dispIns(id); + emitCurIGsize += size; +} + +/***************************************************************************** + * + * Add an instruction with no operands. + */ +void emitter::emitIns(instruction ins) +{ + assert(false); +} + +// Add an instruction with no operands, but whose encoding depends on the size +// (Only CDQ/CQO currently) +void emitter::emitIns(instruction ins, emitAttr attr) +{ + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstr(attr); + code_t code = insCodeMR(ins); + assert(ins == INS_cdq); + assert((code & 0xFFFFFF00) == 0); + sz = 1; + + insFormat fmt = IF_NONE; + + sz += emitGetAdjustedSize(ins, attr, code); + if (TakesRexWPrefix(ins, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + + id->idIns(ins); + id->idInsFmt(fmt); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitMapFmtForIns: map the instruction format based on the instruction. +// Shift-by-a-constant instructions have a special format. +// +// Arguments: +// fmt - the instruction format to map +// ins - the instruction +// +// Returns: +// The mapped instruction format. +// +emitter::insFormat emitter::emitMapFmtForIns(insFormat fmt, instruction ins) +{ + switch (ins) + { + case INS_rol_N: + case INS_ror_N: + case INS_rcl_N: + case INS_rcr_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + { + switch (fmt) + { + case IF_RRW_CNS: + return IF_RRW_SHF; + case IF_MRW_CNS: + return IF_MRW_SHF; + case IF_SRW_CNS: + return IF_SRW_SHF; + case IF_ARW_CNS: + return IF_ARW_SHF; + default: + unreached(); + } + } + + default: + return fmt; + } +} + +//------------------------------------------------------------------------ +// emitMapFmtAtoM: map the address mode formats ARD, ARW, and AWR to their direct address equivalents. +// +// Arguments: +// fmt - the instruction format to map +// +// Returns: +// The mapped instruction format. 
+// +emitter::insFormat emitter::emitMapFmtAtoM(insFormat fmt) +{ + switch (fmt) + { + case IF_ARD: + return IF_MRD; + case IF_AWR: + return IF_MWR; + case IF_ARW: + return IF_MRW; + + case IF_RRD_ARD: + return IF_RRD_MRD; + case IF_RWR_ARD: + return IF_RWR_MRD; + case IF_RWR_ARD_CNS: + return IF_RWR_MRD_CNS; + case IF_RRW_ARD: + return IF_RRW_MRD; + case IF_RRW_ARD_CNS: + return IF_RRW_MRD_CNS; + case IF_RWR_RRD_ARD: + return IF_RWR_RRD_MRD; + case IF_RWR_RRD_ARD_CNS: + return IF_RWR_RRD_MRD_CNS; + case IF_RWR_RRD_ARD_RRD: + return IF_RWR_RRD_MRD_RRD; + + case IF_ARD_RRD: + return IF_MRD_RRD; + case IF_AWR_RRD: + return IF_MWR_RRD; + case IF_ARW_RRD: + return IF_MRW_RRD; + + case IF_ARD_CNS: + return IF_MRD_CNS; + case IF_AWR_CNS: + return IF_MWR_CNS; + case IF_ARW_CNS: + return IF_MRW_CNS; + + case IF_AWR_RRD_CNS: + return IF_MWR_RRD_CNS; + + case IF_ARW_SHF: + return IF_MRW_SHF; + + default: + unreached(); + } +} + +//------------------------------------------------------------------------ +// emitHandleMemOp: For a memory operand, fill in the relevant fields of the instrDesc. +// +// Arguments: +// indir - the memory operand. +// id - the instrDesc to fill in. +// fmt - the instruction format to use. This must be one of the ARD, AWR, or ARW formats. If necessary (such as for +// GT_CLS_VAR_ADDR), this function will map it to the correct format. +// ins - the instruction we are generating. This might affect the instruction format we choose. +// +// Assumptions: +// The correctly sized instrDesc must already be created, e.g., via emitNewInstrAmd() or emitNewInstrAmdCns(); +// +// Post-conditions: +// For base address of int constant: +// -- the caller must have added the int constant base to the instrDesc when creating it via +// emitNewInstrAmdCns(). +// For simple address modes (base + scale * index + offset): +// -- the base register, index register, and scale factor are set. +// -- the caller must have added the addressing mode offset int constant to the instrDesc when creating it via +// emitNewInstrAmdCns(). +// +// The instruction format is set. +// +// idSetIsDspReloc() is called if necessary. +// +void emitter::emitHandleMemOp(GenTreeIndir* indir, instrDesc* id, insFormat fmt, instruction ins) +{ + assert(false); +} + +// Takes care of storing all incoming register parameters +// into its corresponding shadow space (defined by the x64 ABI) +void emitter::spillIntArgRegsToShadowSlots() +{ + unsigned argNum; + instrDesc* id; + UNATIVE_OFFSET sz; + + assert(emitComp->compGeneratingProlog); + + for (argNum = 0; argNum < MAX_REG_ARG; ++argNum) + { + regNumber argReg = intArgRegs[argNum]; + + // The offsets for the shadow space start at RSP + 8 + // (right before the caller return address) + int offset = (argNum + 1) * EA_PTRSIZE; + + id = emitNewInstrAmd(EA_PTRSIZE, offset); + id->idIns(INS_mov); + id->idInsFmt(IF_AWR_RRD); + id->idAddr()->iiaAddrMode.amBaseReg = REG_SPBASE; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(1); + + // The offset has already been set in the intrDsc ctor, + // make sure we got it right. + assert(emitGetInsAmdAny(id) == ssize_t(offset)); + + id->idReg1(argReg); + sz = emitInsSizeAM(id, insCodeMR(INS_mov)); + id->idCodeSize(sz); + emitCurIGsize += sz; + } +} + +//------------------------------------------------------------------------ +// emitInsLoadInd: Emits a "mov reg, [mem]" (or a variant such as "movzx" or "movss") +// instruction for a GT_IND node. 
+// +// Arguments: +// ins - the instruction to emit +// attr - the instruction operand size +// dstReg - the destination register +// mem - the GT_IND node +// +void emitter::emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem) +{ + assert(mem->OperIs(GT_IND, GT_NULLCHECK)); + + GenTree* addr = mem->Addr(); + + if (addr->OperGet() == GT_CLS_VAR_ADDR) + { + emitIns_R_C(ins, attr, dstReg, addr->AsClsVar()->gtClsVarHnd, 0); + return; + } + + if (addr->OperIs(GT_LCL_VAR_ADDR, GT_LCL_FLD_ADDR)) + { + GenTreeLclVarCommon* varNode = addr->AsLclVarCommon(); + unsigned offset = varNode->GetLclOffs(); + emitIns_R_S(ins, attr, dstReg, varNode->GetLclNum(), offset); + + // Updating variable liveness after instruction was emitted. + // TODO-Review: it appears that this call to genUpdateLife does nothing because it + // returns quickly when passed GT_LCL_VAR_ADDR or GT_LCL_FLD_ADDR. Below, emitInsStoreInd + // had similar code that replaced `varNode` with `mem` (to fix a GC hole). It might be + // appropriate to do that here as well, but doing so showed no asm diffs, so it's not + // clear when this scenario gets hit, at least for GC refs. + codeGen->genUpdateLife(varNode); + return; + } + + assert(addr->OperIsAddrMode() || (addr->IsCnsIntOrI() && addr->isContained()) || !addr->isContained()); + ssize_t offset = mem->Offset(); + instrDesc* id = emitNewInstrAmd(attr, offset); + id->idIns(ins); + id->idReg1(dstReg); + emitHandleMemOp(mem, id, IF_RWR_ARD, ins); + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitInsStoreInd: Emits a "mov [mem], reg/imm" (or a variant such as "movss") +// instruction for a GT_STOREIND node. 
+// +// Arguments: +// ins - the instruction to emit +// attr - the instruction operand size +// mem - the GT_STOREIND node +// +void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem) +{ + assert(mem->OperIs(GT_STOREIND)); + + GenTree* addr = mem->Addr(); + GenTree* data = mem->Data(); + + if (addr->OperGet() == GT_CLS_VAR_ADDR) + { + if (data->isContainedIntOrIImmed()) + { + emitIns_C_I(ins, attr, addr->AsClsVar()->gtClsVarHnd, 0, (int)data->AsIntConCommon()->IconValue()); + } + else + { + assert(!data->isContained()); + emitIns_C_R(ins, attr, addr->AsClsVar()->gtClsVarHnd, data->GetRegNum(), 0); + } + return; + } + + if (addr->OperIs(GT_LCL_VAR_ADDR, GT_LCL_FLD_ADDR)) + { + GenTreeLclVarCommon* varNode = addr->AsLclVarCommon(); + unsigned offset = varNode->GetLclOffs(); + if (data->isContainedIntOrIImmed()) + { + emitIns_S_I(ins, attr, varNode->GetLclNum(), offset, (int)data->AsIntConCommon()->IconValue()); + } + else + { + assert(!data->isContained()); + emitIns_S_R(ins, attr, data->GetRegNum(), varNode->GetLclNum(), offset); + } + + // Updating variable liveness after instruction was emitted + codeGen->genUpdateLife(mem); + return; + } + + ssize_t offset = mem->Offset(); + UNATIVE_OFFSET sz; + instrDesc* id; + + if (data->isContainedIntOrIImmed()) + { + int icon = (int)data->AsIntConCommon()->IconValue(); + id = emitNewInstrAmdCns(attr, offset, icon); + id->idIns(ins); + emitHandleMemOp(mem, id, IF_AWR_CNS, ins); + sz = emitInsSizeAM(id, insCodeMI(ins), icon); + id->idCodeSize(sz); + } + else + { + assert(!data->isContained()); + id = emitNewInstrAmd(attr, offset); + id->idIns(ins); + emitHandleMemOp(mem, id, IF_AWR_RRD, ins); + id->idReg1(data->GetRegNum()); + sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + } + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitInsStoreLcl: Emits a "mov [mem], reg/imm" (or a variant such as "movss") +// instruction for a GT_STORE_LCL_VAR node. 
+//
+// Arguments:
+//    ins - the instruction to emit
+//    attr - the instruction operand size
+//    varNode - the GT_STORE_LCL_VAR node
+//
+void emitter::emitInsStoreLcl(instruction ins, emitAttr attr, GenTreeLclVarCommon* varNode)
+{
+    assert(varNode->OperIs(GT_STORE_LCL_VAR));
+    assert(varNode->GetRegNum() == REG_NA); // stack store
+
+    GenTree* data = varNode->gtGetOp1();
+    codeGen->inst_set_SV_var(varNode);
+
+    if (data->isContainedIntOrIImmed())
+    {
+        emitIns_S_I(ins, attr, varNode->GetLclNum(), 0, (int)data->AsIntConCommon()->IconValue());
+    }
+    else
+    {
+        assert(!data->isContained());
+        emitIns_S_R(ins, attr, data->GetRegNum(), varNode->GetLclNum(), 0);
+    }
+
+    // Updating variable liveness after instruction was emitted
+    codeGen->genUpdateLife(varNode);
+}
+
+//------------------------------------------------------------------------
+// emitInsBinary: Emits an instruction for a node which takes two operands
+//
+// Arguments:
+//    ins - the instruction to emit
+//    attr - the instruction operand size
+//    dst - the destination and first source operand
+//    src - the second source operand
+//
+// Assumptions:
+//  i) caller of this routine needs to call genConsumeReg()
+// ii) caller of this routine needs to call genProduceReg()
+regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src)
+{
+    assert(false);
+    return (regNumber)0;
+}
+
+//------------------------------------------------------------------------
+// emitInsRMW: Emit logic for Read-Modify-Write binary instructions.
+//
+// Responsible for emitting a single instruction that will perform an operation of the form:
+//      *addr = *addr <BinOp> src
+// For example:
+//      ADD [RAX], RCX
+//
+// Arguments:
+//    ins - instruction to generate
+//    attr - emitter attribute for instruction
+//    storeInd - indir for RMW addressing mode
+//    src - source operand of instruction
+//
+// Assumptions:
+//    Lowering has taken care of recognizing the StoreInd pattern of:
+//          StoreInd( AddressTree, BinOp( Ind ( AddressTree ), Operand ) )
+//    The address to store is already sitting in a register.
+//
+// Notes:
+//    This is a no-produce operation, meaning that no register output will
+//    be produced for future use in the code stream.
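+//    As an illustration of the pattern above (names purely illustrative):
+//    with AddressTree = LEA(rax + 8), BinOp = ADD and a contained integer
+//    Operand of 3, this emits a single "add dword ptr [rax+8], 3" rather
+//    than a separate load/add/store sequence.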
+// +void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeInd, GenTree* src) +{ + GenTree* addr = storeInd->Addr(); + addr = addr->gtSkipReloadOrCopy(); + assert(addr->OperIs(GT_LCL_VAR, GT_LCL_VAR_ADDR, GT_LEA, GT_CLS_VAR_ADDR, GT_CNS_INT)); + + instrDesc* id = nullptr; + UNATIVE_OFFSET sz; + + ssize_t offset = 0; + if (addr->OperGet() != GT_CLS_VAR_ADDR) + { + offset = storeInd->Offset(); + } + + if (src->isContainedIntOrIImmed()) + { + GenTreeIntConCommon* intConst = src->AsIntConCommon(); + int iconVal = (int)intConst->IconValue(); + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + iconVal &= 0x7F; + break; + default: + break; + } + + id = emitNewInstrAmdCns(attr, offset, iconVal); + emitHandleMemOp(storeInd, id, IF_ARW_CNS, ins); + id->idIns(ins); + sz = emitInsSizeAM(id, insCodeMI(ins), iconVal); + } + else + { + assert(!src->isContained()); // there must be one non-contained src + + // ind, reg + id = emitNewInstrAmd(attr, offset); + emitHandleMemOp(storeInd, id, IF_ARW_RRD, ins); + id->idReg1(src->GetRegNum()); + id->idIns(ins); + sz = emitInsSizeAM(id, insCodeMR(ins)); + } + + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitInsRMW: Emit logic for Read-Modify-Write unary instructions. +// +// Responsible for emitting a single instruction that will perform an operation of the form: +// *addr = UnaryOp *addr +// For example: +// NOT [RAX] +// +// Arguments: +// ins - instruction to generate +// attr - emitter attribute for instruction +// storeInd - indir for RMW addressing mode +// +// Assumptions: +// Lowering has taken care of recognizing the StoreInd pattern of: +// StoreInd( AddressTree, UnaryOp( Ind ( AddressTree ) ) ) +// The address to store is already sitting in a register. +// +// Notes: +// This is a no-produce operation, meaning that no register output will +// be produced for future use in the code stream. +// +void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeInd) +{ + GenTree* addr = storeInd->Addr(); + addr = addr->gtSkipReloadOrCopy(); + assert(addr->OperIs(GT_LCL_VAR, GT_LCL_VAR_ADDR, GT_CLS_VAR_ADDR, GT_LEA, GT_CNS_INT)); + + ssize_t offset = 0; + if (addr->OperGet() != GT_CLS_VAR_ADDR) + { + offset = storeInd->Offset(); + } + + instrDesc* id = emitNewInstrAmd(attr, offset); + emitHandleMemOp(storeInd, id, IF_ARW, ins); + id->idIns(ins); + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Add an instruction referencing a single register. 
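+ *  (for example "inc eax", "push rax", or one of the setcc forms that the
+ *  switch below special-cases)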
+ */ + +void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) +{ + emitAttr size = EA_SIZE(attr); + + assert(size <= EA_PTRSIZE); + noway_assert(emitVerifyEncodable(ins, size, reg)); + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrSmall(attr); + + switch (ins) + { + case INS_inc: + case INS_dec: +#ifdef TARGET_AMD64 + + sz = 2; // x64 has no 1-byte opcode (it is the same encoding as the REX prefix) + +#else // !TARGET_AMD64 + + if (size == EA_1BYTE) + sz = 2; // Use the long form as the small one has no 'w' bit + else + sz = 1; // Use short form + +#endif // !TARGET_AMD64 + + break; + + case INS_pop: + case INS_pop_hide: + case INS_push: + case INS_push_hide: + + /* We don't currently push/pop small values */ + + assert(size == EA_PTRSIZE); + + sz = 1; + break; + + default: + + /* All the sixteen INS_setCCs are contiguous. */ + + if (INS_seto <= ins && ins <= INS_setg) + { + // Rough check that we used the endpoints for the range check + + assert(INS_seto + 0xF == INS_setg); + + // The caller must specify EA_1BYTE for 'attr' + + assert(attr == EA_1BYTE); + + /* We expect this to always be a 'big' opcode */ + + assert(insEncodeMRreg(ins, reg, attr, insCodeMR(ins)) & 0x00FF0000); + + size = attr; + + sz = 3; + break; + } + else + { + sz = 2; + break; + } + } + insFormat fmt = emitInsModeFormat(ins, IF_RRD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(reg); + + // Vex bytes + sz += emitGetAdjustedSize(ins, attr, insEncodeMRreg(ins, reg, attr, insCodeMR(ins))); + + // REX byte + if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +/***************************************************************************** + * + * Add an instruction referencing a register and a constant. + */ + +void emitter::emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, ssize_t val) +{ + assert(false); +} + +/***************************************************************************** + * + * Add an instruction referencing an integer constant. + */ + +void emitter::emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val) +{ + UNATIVE_OFFSET sz; + instrDesc* id; + bool valInByte = ((signed char)val == (target_ssize_t)val); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + if (EA_IS_CNS_RELOC(attr)) + { + valInByte = false; // relocs can't be placed in a byte + } + + switch (ins) + { + case INS_loop: + case INS_jge: + sz = 2; + break; + + case INS_ret: + sz = 3; + break; + + case INS_push_hide: + case INS_push: + sz = valInByte ? 2 : 5; + break; + + default: + NO_WAY("unexpected instruction"); + } + + id = emitNewInstrSC(attr, val); + id->idIns(ins); + id->idInsFmt(IF_CNS); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +/***************************************************************************** + * + * Add a "jump through a table" instruction. 
+ */ + +void emitter::emitIns_IJ(emitAttr attr, regNumber reg, unsigned base) +{ + assert(EA_SIZE(attr) == EA_4BYTE); + + UNATIVE_OFFSET sz = 3 + 4; + const instruction ins = INS_i_jmp; + + if (IsExtendedReg(reg, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + + instrDesc* id = emitNewInstrAmd(attr, base); + + id->idIns(ins); + id->idInsFmt(IF_ARD); + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = reg; + id->idAddr()->iiaAddrMode.amScale = emitter::OPSZP; + +#ifdef DEBUG + id->idDebugOnlyInfo()->idMemCookie = base; +#endif + + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Add an instruction with a static data member operand. If 'size' is 0, the + * instruction operates on the address of the static member instead of its + * value (e.g. "push offset clsvar", rather than "push dword ptr [clsvar]"). + */ + +void emitter::emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, int offs) +{ + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + UNATIVE_OFFSET sz; + instrDesc* id; + + /* Are we pushing the offset of the class variable? */ + + if (EA_IS_OFFSET(attr)) + { + assert(ins == INS_push); + sz = 1 + TARGET_POINTER_SIZE; + + id = emitNewInstrDsp(EA_1BYTE, offs); + id->idIns(ins); + id->idInsFmt(IF_MRD_OFF); + } + else + { + insFormat fmt = emitInsModeFormat(ins, IF_MRD); + + id = emitNewInstrDsp(attr, offs); + id->idIns(ins); + id->idInsFmt(fmt); + sz = emitInsSizeCV(id, insCodeMR(ins)); + } + + if (TakesRexWPrefix(ins, attr)) + { + // REX.W prefix + sz += emitGetRexPrefixSize(ins); + } + + id->idAddr()->iiaFieldHnd = fldHnd; + + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +/***************************************************************************** + * + * Add an instruction with two register operands. + */ + +void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2) +{ + emitAttr size = EA_SIZE(attr); + + /* We don't want to generate any useless mov instructions! */ + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef TARGET_AMD64 + // Same-reg 4-byte mov can be useful because it performs a + // zero-extension to 8 bytes. + assert(ins != INS_mov || reg1 != reg2 || size == EA_4BYTE); +#else + assert(ins != INS_mov || reg1 != reg2); +#endif // TARGET_AMD64 + + assert(size <= EA_32BYTE); + noway_assert(emitVerifyEncodable(ins, size, reg1, reg2)); + + UNATIVE_OFFSET sz = emitInsSizeRR(ins, reg1, reg2, attr); + + /* Special case: "XCHG" uses a different format */ + insFormat fmt = (ins == INS_xchg) ? IF_RRW_RRW : emitInsModeFormat(ins, IF_RRD_RRD); + + instrDesc* id = emitNewInstrSmall(attr); + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(reg1); + id->idReg2(reg2); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Add an instruction with two register operands and an integer constant. 
+ */ + +void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival) +{ +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + instrDesc* id = emitNewInstrSC(attr, ival); + + id->idIns(ins); + id->idInsFmt(IF_RRW_RRW_CNS); + id->idReg1(reg1); + id->idReg2(reg2); + + code_t code = 0; + + switch (ins) + { + case INS_pextrb: + case INS_pextrd: + case INS_pextrq: + case INS_pextrw_sse41: + case INS_extractps: + case INS_vextractf128: + case INS_vextracti128: + case INS_shld: + case INS_shrd: + { + code = insCodeMR(ins); + break; + } + + case INS_psrldq: + case INS_pslldq: + { + code = insCodeMI(ins); + break; + } + + default: + { + code = insCodeRM(ins); + break; + } + } + + UNATIVE_OFFSET sz = emitInsSizeRR(id, code, ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs) +{ + assert(ins == INS_prefetcht0 || ins == INS_prefetcht1 || ins == INS_prefetcht2 || ins == INS_prefetchnta); + + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + + id->idInsFmt(IF_ARD); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitIns_AR_R_R: emits the code for an instruction that takes a base memory register, two register operands +// and that does not return a value +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op2Reg -- The register of the second operand +// op3Reg -- The register of the third operand +// base -- The base register used for the memory address (first operand) +// offs -- The offset from base +// +void emitter::emitIns_AR_R_R( + instruction ins, emitAttr attr, regNumber op2Reg, regNumber op3Reg, regNumber base, int offs) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + id->idReg1(op2Reg); + id->idReg2(op3Reg); + + id->idInsFmt(IF_AWR_RRD_RRD); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir) +{ + ssize_t offs = indir->Offset(); + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + id->idReg1(reg1); + + emitHandleMemOp(indir, id, IF_RRW_ARD, ins); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival) +{ + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + assert(IsSSEOrAVXInstruction(ins)); + + ssize_t offs = indir->Offset(); + instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); + + id->idIns(ins); + id->idReg1(reg1); + + emitHandleMemOp(indir, id, IF_RRW_ARD_CNS, ins); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + 
dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival) +{ + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + assert(IsSSEOrAVXInstruction(ins)); + + instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); + + id->idIns(ins); + id->idReg1(reg1); + + id->idInsFmt(IF_RRW_ARD_CNS); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_C_I( + instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) +{ + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + assert(IsSSEOrAVXInstruction(ins)); + + instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); + + id->idIns(ins); + id->idInsFmt(IF_RRW_MRD_CNS); + id->idReg1(reg1); + id->idAddr()->iiaFieldHnd = fldHnd; + + UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival) +{ + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); + assert(IsSSEOrAVXInstruction(ins)); + + instrDesc* id = emitNewInstrCns(attr, ival); + + id->idIns(ins); + id->idInsFmt(IF_RRW_SRD_CNS); + id->idReg1(reg1); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + + UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs, ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + instrDesc* id = emitNewInstr(attr); + + id->idIns(ins); + id->idInsFmt(IF_RWR_RRD_SRD); + id->idReg1(reg1); + id->idReg2(reg2); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + + UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + ssize_t offs = indir->Offset(); + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + id->idReg1(reg1); + id->idReg2(reg2); + + emitHandleMemOp(indir, id, IF_RWR_RRD_ARD, ins); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + id->idReg1(reg1); + id->idReg2(reg2); + + id->idInsFmt(IF_RWR_RRD_ARD); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + 
emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// IsAVX2GatherInstruction: return true if the instruction is AVX2 Gather +// +// Arguments: +// ins - the instruction to check +// Return Value: +// true if the instruction is AVX2 Gather +// +bool IsAVX2GatherInstruction(instruction ins) +{ + switch (ins) + { + case INS_vpgatherdd: + case INS_vpgatherdq: + case INS_vpgatherqd: + case INS_vpgatherqq: + case INS_vgatherdps: + case INS_vgatherdpd: + case INS_vgatherqps: + case INS_vgatherqpd: + return true; + default: + return false; + } +} + +//------------------------------------------------------------------------ +// emitIns_R_AR_R: Emits an AVX2 Gather instructions +// +// Arguments: +// ins - the instruction to emit +// attr - the instruction operand size +// reg1 - the destination and first source operand +// reg2 - the mask operand (encoded in VEX.vvvv) +// base - the base register of address to load +// index - the index register of VSIB +// scale - the scale number of VSIB +// offs - the offset added to the memory address from base +// +void emitter::emitIns_R_AR_R(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + regNumber base, + regNumber index, + int scale, + int offs) +{ + assert(IsAVX2GatherInstruction(ins)); + + instrDesc* id = emitNewInstrAmd(attr, offs); + + id->idIns(ins); + id->idReg1(reg1); + id->idReg2(reg2); + + id->idInsFmt(IF_RWR_ARD_RRD); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = index; + id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_R_C( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + instrDesc* id = emitNewInstrDsp(attr, offs); + + id->idIns(ins); + id->idInsFmt(IF_RWR_RRD_MRD); + id->idReg1(reg1); + id->idReg2(reg2); + id->idAddr()->iiaFieldHnd = fldHnd; + + UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** +* +* Add an instruction with three register operands. 
+*/ + +void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2) +{ + assert(false); +} + +void emitter::emitIns_R_R_AR_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); + + id->idIns(ins); + id->idReg1(reg1); + id->idReg2(reg2); + + id->idInsFmt(IF_RWR_RRD_ARD_CNS); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_R_C_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); + + id->idIns(ins); + id->idInsFmt(IF_RWR_RRD_MRD_CNS); + id->idReg1(reg1); + id->idReg2(reg2); + id->idAddr()->iiaFieldHnd = fldHnd; + + UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/********************************************************************************** +* emitIns_R_R_R_I: Add an instruction with three register operands and an immediate. +* +* Arguments: +* ins - the instruction to add +* attr - the emitter attribute for instruction +* targetReg - the target (destination) register +* reg1 - the first source register +* reg2 - the second source register +* ival - the immediate value +*/ + +void emitter::emitIns_R_R_R_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, int ival) +{ + assert(false); +} + +void emitter::emitIns_R_R_S_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival) +{ + assert(IsSSEOrAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins)); + + instrDesc* id = emitNewInstrCns(attr, ival); + + id->idIns(ins); + id->idInsFmt(IF_RWR_RRD_SRD_CNS); + id->idReg1(reg1); + id->idReg2(reg2); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + + UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs, ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// encodeXmmRegAsIval: Encodes a XMM register into imm[7:4] for use by a SIMD instruction +// +// Arguments +// opReg -- The register being encoded +// +// Returns: +// opReg encoded in imm[7:4] +static int encodeXmmRegAsIval(regNumber opReg) +{ + // AVX/AVX2 supports 4-reg format for vblendvps/vblendvpd/vpblendvb, + // which encodes the fourth register into imm8[7:4] + assert(opReg >= XMMBASE); + int ival = (opReg - XMMBASE) << 4; + + assert((ival >= 0) && (ival <= 255)); + return (int8_t)ival; +} + +//------------------------------------------------------------------------ +// emitIns_R_R_A_R: emits the code for an instruction that takes a register operand, a GenTreeIndir address, +// another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- 
The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op3Reg -- The register of the third operand +// indir -- The GenTreeIndir used for the memory address +// +// Remarks: +// op2 is built from indir +// +void emitter::emitIns_R_R_A_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir) +{ + assert(isAvxBlendv(ins)); + assert(UseVEXEncoding()); + + int ival = encodeXmmRegAsIval(op3Reg); + ssize_t offs = indir->Offset(); + instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); + + id->idIns(ins); + id->idReg1(targetReg); + id->idReg2(op1Reg); + + emitHandleMemOp(indir, id, IF_RWR_RRD_ARD_RRD, ins); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitIns_R_R_AR_R: emits the code for an instruction that takes a register operand, a base memory +// register, another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operands +// op3Reg -- The register of the third operand +// base -- The base register used for the memory address +// offs -- The offset added to the memory address from base +// +// Remarks: +// op2 is built from base + offs +// +void emitter::emitIns_R_R_AR_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base, int offs) +{ + assert(isAvxBlendv(ins)); + assert(UseVEXEncoding()); + + int ival = encodeXmmRegAsIval(op3Reg); + instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); + + id->idIns(ins); + id->idReg1(targetReg); + id->idReg2(op1Reg); + + id->idInsFmt(IF_RWR_RRD_ARD_RRD); + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +//------------------------------------------------------------------------ +// emitIns_R_R_C_R: emits the code for an instruction that takes a register operand, a field handle + +// offset, another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op3Reg -- The register of the third operand +// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address +// offs -- The offset added to the memory address from fldHnd +// +// Remarks: +// op2 is built from fldHnd + offs +// +void emitter::emitIns_R_R_C_R(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + regNumber op3Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs) +{ + assert(isAvxBlendv(ins)); + assert(UseVEXEncoding()); + + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + int ival = encodeXmmRegAsIval(op3Reg); + instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); + + id->idIns(ins); + id->idReg1(targetReg); + id->idReg2(op1Reg); + + id->idInsFmt(IF_RWR_RRD_MRD_RRD); + id->idAddr()->iiaFieldHnd = fldHnd; + + UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins), ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + 
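+// Note: the emitIns_R_R_A_R / emitIns_R_R_AR_R / emitIns_R_R_C_R overloads
+// above all funnel the fourth (register) operand of the AVX blendv forms
+// through encodeXmmRegAsIval, which packs the register index into imm8[7:4].
+// A quick worked example (assuming, as on xarch, that REG_XMM0..REG_XMMn are
+// numbered consecutively from XMMBASE):
+//
+//     encodeXmmRegAsIval(REG_XMM5) == (5 << 4) == 0x50
+//
+// so "vblendvps xmm1, xmm2, [mem], xmm5" ends up as the RM opcode form
+// followed by its address mode bytes and a trailing immediate byte of 0x50.
+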
+//------------------------------------------------------------------------ +// emitIns_R_R_R_S: emits the code for a instruction that takes a register operand, a variable index + +// offset, another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op3Reg -- The register of the third operand +// varx -- The variable index used for the memory address +// offs -- The offset added to the memory address from varx +// +// Remarks: +// op2 is built from varx + offs +// +void emitter::emitIns_R_R_S_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, int varx, int offs) +{ + assert(isAvxBlendv(ins)); + assert(UseVEXEncoding()); + + int ival = encodeXmmRegAsIval(op3Reg); + instrDesc* id = emitNewInstrCns(attr, ival); + + id->idIns(ins); + id->idReg1(targetReg); + id->idReg2(op1Reg); + + id->idInsFmt(IF_RWR_RRD_SRD_RRD); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + + UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs, ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_R_R_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3) +{ + assert(false); +} + +/***************************************************************************** + * + * Add an instruction with a register + static member operands. + */ +void emitter::emitIns_R_C(instruction ins, emitAttr attr, regNumber reg, CORINFO_FIELD_HANDLE fldHnd, int offs) +{ + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + emitAttr size = EA_SIZE(attr); + + assert(size <= EA_32BYTE); + noway_assert(emitVerifyEncodable(ins, size, reg)); + + UNATIVE_OFFSET sz; + instrDesc* id; + + // Are we MOV'ing the offset of the class variable into EAX? + if (EA_IS_OFFSET(attr)) + { + id = emitNewInstrDsp(EA_1BYTE, offs); + id->idIns(ins); + id->idInsFmt(IF_RWR_MRD_OFF); + id->idReg1(reg); + + assert(ins == INS_mov && reg == REG_EAX); + + // Special case: "mov eax, [addr]" is smaller + sz = 1 + TARGET_POINTER_SIZE; + } + else + { + insFormat fmt = emitInsModeFormat(ins, IF_RRD_MRD); + + id = emitNewInstrDsp(attr, offs); + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(reg); + +#ifdef TARGET_X86 + // Special case: "mov eax, [addr]" is smaller. + // This case is not enabled for amd64 as it always uses RIP relative addressing + // and it results in smaller instruction size than encoding 64-bit addr in the + // instruction. + if (ins == INS_mov && reg == REG_EAX) + { + sz = 1 + TARGET_POINTER_SIZE; + if (size == EA_2BYTE) + sz += 1; + } + else +#endif // TARGET_X86 + { + sz = emitInsSizeCV(id, insCodeRM(ins)); + } + + // Special case: mov reg, fs:[ddd] + if (fldHnd == FLD_GLOBAL_FS) + { + sz += 1; + } + } + + id->idCodeSize(sz); + + id->idAddr()->iiaFieldHnd = fldHnd; + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Add an instruction with a static member + register operands. 
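+ *  (i.e. a store of a register into a static field, e.g. "mov [clsVar], eax";
+ *  note the x86-only shortening of the EAX form below)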
+ */ + +void emitter::emitIns_C_R(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, regNumber reg, int offs) +{ + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + emitAttr size = EA_SIZE(attr); + +#if defined(TARGET_X86) + // For x86 it is valid to storeind a double sized operand in an xmm reg to memory + assert(size <= EA_8BYTE); +#else + assert(size <= EA_PTRSIZE); +#endif + + noway_assert(emitVerifyEncodable(ins, size, reg)); + + instrDesc* id = emitNewInstrDsp(attr, offs); + insFormat fmt = emitInsModeFormat(ins, IF_MRD_RRD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(reg); + + UNATIVE_OFFSET sz; + +#ifdef TARGET_X86 + // Special case: "mov [addr], EAX" is smaller. + // This case is not enable for amd64 as it always uses RIP relative addressing + // and it will result in smaller instruction size than encoding 64-bit addr in + // the instruction. + if (ins == INS_mov && reg == REG_EAX) + { + sz = 1 + TARGET_POINTER_SIZE; + + if (size == EA_2BYTE) + sz += 1; + + // REX prefix + if (TakesRexWPrefix(ins, attr) || IsExtendedReg(reg, attr)) + { + sz += emitGetRexPrefixSize(ins); + } + } + else +#endif // TARGET_X86 + { + sz = emitInsSizeCV(id, insCodeMR(ins)); + } + + // Special case: mov reg, fs:[ddd] + if (fldHnd == FLD_GLOBAL_FS) + { + sz += 1; + } + + id->idCodeSize(sz); + + id->idAddr()->iiaFieldHnd = fldHnd; + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Add an instruction with a static member + constant. + */ + +void emitter::emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, int offs, int val) +{ + // Static always need relocs + if (!jitStaticFldIsGlobAddr(fldHnd)) + { + attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); + } + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_MRW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_MRD_CNS); + break; + } + + instrDesc* id = emitNewInstrCnsDsp(attr, val, offs); + id->idIns(ins); + id->idInsFmt(fmt); + id->idAddr()->iiaFieldHnd = fldHnd; + + code_t code = insCodeMI(ins); + UNATIVE_OFFSET sz = emitInsSizeCV(id, code, val); + + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_J_S(instruction ins, emitAttr attr, BasicBlock* dst, int varx, int offs) +{ + assert(ins == INS_mov); + assert(dst->bbFlags & BBF_JMP_TARGET); + + instrDescLbl* id = emitNewInstrLbl(); + + id->idIns(ins); + id->idInsFmt(IF_SWR_LABEL); + id->idAddr()->iiaBBlabel = dst; + + /* The label reference is always long */ + + id->idjShort = 0; + id->idjKeepLong = 1; + + /* Record the current IG and offset within it */ + + id->idjIG = emitCurIG; + id->idjOffs = emitCurIGsize; + + /* Append this instruction to this IG's jump list */ + + id->idjNext = emitCurIGjmpList; + emitCurIGjmpList = id; + + UNATIVE_OFFSET sz = sizeof(INT32) + emitInsSizeSV(id, insCodeMI(ins), varx, offs); + id->dstLclVar.initLclVarAddr(varx, offs); +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + +#if EMITTER_STATS + emitTotalIGjmps++; +#endif + +#ifndef TARGET_AMD64 + // Storing the address of a basicBlock will need a reloc + // as the instruction uses the absolute address, + // not a relative address. 
+ // + // On Amd64, Absolute code addresses should always go through a reloc to + // to be encoded as RIP rel32 offset. + if (emitComp->opts.compReloc) +#endif + { + id->idSetIsDspReloc(); + } + + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Add a label instruction. + */ +void emitter::emitIns_R_L(instruction ins, emitAttr attr, BasicBlock* dst, regNumber reg) +{ + assert(ins == INS_lea); + assert(dst->bbFlags & BBF_JMP_TARGET); + + instrDescJmp* id = emitNewInstrJmp(); + + id->idIns(ins); + id->idReg1(reg); + id->idInsFmt(IF_RWR_LABEL); + id->idOpSize(EA_SIZE(attr)); // emitNewInstrJmp() sets the size (incorrectly) to EA_1BYTE + id->idAddr()->iiaBBlabel = dst; + + /* The label reference is always long */ + + id->idjShort = 0; + id->idjKeepLong = 1; + + /* Record the current IG and offset within it */ + + id->idjIG = emitCurIG; + id->idjOffs = emitCurIGsize; + + /* Append this instruction to this IG's jump list */ + + id->idjNext = emitCurIGjmpList; + emitCurIGjmpList = id; + +#ifdef DEBUG + // Mark the catch return + if (emitComp->compCurBB->bbJumpKind == BBJ_EHCATCHRET) + { + id->idDebugOnlyInfo()->idCatchRet = true; + } +#endif // DEBUG + +#if EMITTER_STATS + emitTotalIGjmps++; +#endif + + // Set the relocation flags - these give hint to zap to perform + // relocation of the specified 32bit address. + // + // Note the relocation flags influence the size estimate. + id->idSetRelocFlags(attr); + + UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * The following adds instructions referencing address modes. 
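+ *  (for the shift-by-immediate forms the constant is masked to 7 bits and the
+ *  *_SHF instruction formats are used, as in emitIns_I_AR below)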
+ */ + +void emitter::emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_ARW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_ARD_CNS); + break; + } + + /* + Useful if you want to trap moves with 0 constant + if (ins == INS_mov && val == 0 && EA_SIZE(attr) >= EA_4BYTE) + { + printf("MOV 0\n"); + } + */ + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmdCns(attr, disp, val); + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = reg; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMI(ins), val); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_I_AI(instruction ins, emitAttr attr, int val, ssize_t disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_ARW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_ARD_CNS); + break; + } + + /* + Useful if you want to trap moves with 0 constant + if (ins == INS_mov && val == 0 && EA_SIZE(attr) >= EA_4BYTE) + { + printf("MOV 0\n"); + } + */ + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmdCns(attr, disp, val); + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMI(ins), val); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber base, int disp) +{ + emitIns_R_ARX(ins, attr, reg, base, REG_NA, 1, disp); +} + +void emitter::emitIns_R_AI(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE) && (ireg != REG_NA)); + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmd(attr, disp); + insFormat fmt = emitInsModeFormat(ins, IF_RRD_ARD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(ireg); + + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_AR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp) +{ + 
emitIns_ARX_R(ins, attr, reg, base, REG_NA, 1, disp); +} + +//------------------------------------------------------------------------ +// emitIns_S_R_I: emits the code for an instruction that takes a stack operand, +// a register operand, and an immediate. +// +// Arguments: +// ins - The instruction being emitted +// attr - The emit attribute +// varNum - The varNum of the stack operand +// offs - The offset for the stack operand +// reg - The register operand +// ival - The immediate value +// +void emitter::emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs, regNumber reg, int ival) +{ + // This is only used for INS_vextracti128 and INS_vextractf128, and for these 'ival' must be 0 or 1. + assert(ins == INS_vextracti128 || ins == INS_vextractf128); + assert((ival == 0) || (ival == 1)); + instrDesc* id = emitNewInstrAmdCns(attr, 0, ival); + + id->idIns(ins); + id->idInsFmt(IF_SWR_RRD_CNS); + id->idReg1(reg); + id->idAddr()->iiaLclVar.initLclVarAddr(varNum, offs); +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + + UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeMR(ins), varNum, offs, ival); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, regNumber reg, int imm) +{ + assert((ins == INS_vextracti128) || (ins == INS_vextractf128)); + assert(attr == EA_32BYTE); + assert(reg != REG_NA); + + instrDesc* id = emitNewInstrAmdCns(attr, indir->Offset(), imm); + id->idIns(ins); + id->idReg1(reg); + emitHandleMemOp(indir, id, IF_AWR_RRD_CNS, ins); + UNATIVE_OFFSET size = emitInsSizeAM(id, insCodeMR(ins), imm); + id->idCodeSize(size); + dispIns(id); + emitCurIGsize += size; +} + +void emitter::emitIns_AI_R(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp) +{ + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmd(attr, disp); + insFormat fmt; + + if (ireg == REG_NA) + { + fmt = emitInsModeFormat(ins, IF_ARD); + } + else + { + fmt = emitInsModeFormat(ins, IF_ARD_RRD); + + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); + + id->idReg1(ireg); + } + + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +void emitter::emitIns_I_ARR(instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, int disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_ARW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_ARD_CNS); + break; + } + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmdCns(attr, disp, val); + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = reg; + id->idAddr()->iiaAddrMode.amIndxReg = rg2; + id->idAddr()->iiaAddrMode.amScale = 
emitter::OPSZ1; + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMI(ins), val); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_ARR(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp) +{ + emitIns_R_ARX(ins, attr, reg, base, index, 1, disp); +} + +void emitter::emitIns_ARR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp) +{ + emitIns_ARX_R(ins, attr, reg, base, index, 1, disp); +} + +void emitter::emitIns_I_ARX( + instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, unsigned mul, int disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_ARW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_ARD_CNS); + break; + } + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmdCns(attr, disp, val); + + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = reg; + id->idAddr()->iiaAddrMode.amIndxReg = rg2; + id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMI(ins), val); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_ARX( + instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp) +{ + assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE) && (reg != REG_NA)); + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); + + if ((ins == INS_lea) && (reg == base) && (index == REG_NA) && (disp == 0)) + { + // Maybe the emitter is not the common place for this optimization, but it's a better choke point + // for all the emitIns(ins, tree), we would have to be analyzing at each call site + // + return; + } + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmd(attr, disp); + insFormat fmt = emitInsModeFormat(ins, IF_RRD_ARD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(reg); + + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = index; + id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(scale); + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_ARX_R( + instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, cnsval_ssize_t disp) +{ + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmd(attr, disp); + insFormat fmt; + + if (reg == REG_NA) + { + fmt = emitInsModeFormat(ins, IF_ARD); + } + else + { + fmt = emitInsModeFormat(ins, IF_ARD_RRD); + + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); + assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE)); + + id->idReg1(reg); + } + + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = base; + id->idAddr()->iiaAddrMode.amIndxReg = index; + 
id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(scale); + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +void emitter::emitIns_I_AX(instruction ins, emitAttr attr, int val, regNumber reg, unsigned mul, int disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_ARW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_ARD_CNS); + break; + } + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmdCns(attr, disp, val); + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = reg; + id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMI(ins), val); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp) +{ + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE) && (ireg != REG_NA)); + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmd(attr, disp); + insFormat fmt = emitInsModeFormat(ins, IF_RRD_ARD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(ireg); + + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = reg; + id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeRM(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp) +{ + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstrAmd(attr, disp); + insFormat fmt; + + if (ireg == REG_NA) + { + fmt = emitInsModeFormat(ins, IF_ARD); + } + else + { + fmt = emitInsModeFormat(ins, IF_ARD_RRD); + noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); + assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); + + id->idReg1(ireg); + } + + id->idIns(ins); + id->idInsFmt(fmt); + + id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; + id->idAddr()->iiaAddrMode.amIndxReg = reg; + id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); + + assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly + + sz = emitInsSizeAM(id, insCodeMR(ins)); + id->idCodeSize(sz); + + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_I: emits the code for an instruction that takes a register operand, an immediate operand +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The 
target register +// op1Reg -- The register of the first operand +// ival -- The immediate value +// +// Notes: +// This will handle the required register copy if 'op1Reg' and 'targetReg' are not the same, and +// the 3-operand format is not available. +// This is not really SIMD-specific, but is currently only used in that context, as that's +// where we frequently need to handle the case of generating 3-operand or 2-operand forms +// depending on what target ISA is supported. +// +void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival) +{ + if (UseVEXEncoding() || IsDstSrcImmAvxInstruction(ins)) + { + emitIns_R_R_I(ins, attr, targetReg, op1Reg, ival); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_I(ins, attr, targetReg, ival); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_A: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// indir -- The GenTreeIndir used for the memory address +// +void emitter::emitIns_SIMD_R_R_A( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_A(ins, attr, targetReg, op1Reg, indir); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_A(ins, attr, targetReg, indir); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_AR: emits the code for a SIMD instruction that takes a register operand, a base memory register, +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// base -- The base register used for the memory address +// offset -- The memory offset +// +void emitter::emitIns_SIMD_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, offset); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_AR(ins, attr, targetReg, base, offset); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_C: emits the code for a SIMD instruction that takes a register operand, a field handle + offset, +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address +// offs -- The offset added to the memory address from fldHnd +// +void emitter::emitIns_SIMD_R_R_C( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_C(ins, attr, targetReg, fldHnd, offs); + } 
+} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R: emits the code for a SIMD instruction that takes two register operands, and that returns a +// value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op2Reg -- The register of the second operand +// +void emitter::emitIns_SIMD_R_R_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_R(ins, attr, targetReg, op1Reg, op2Reg); + } + else + { + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 + assert(op2Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_R(ins, attr, targetReg, op2Reg); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_S: emits the code for a SIMD instruction that takes a register operand, a variable index + offset, +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// varx -- The variable index used for the memory address +// offs -- The offset added to the memory address from varx +// +void emitter::emitIns_SIMD_R_R_S( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_S(ins, attr, targetReg, varx, offs); + } +} + +#ifdef FEATURE_HW_INTRINSICS +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_A_I: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, +// an immediate operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// indir -- The GenTreeIndir used for the memory address +// ival -- The immediate value +// +void emitter::emitIns_SIMD_R_R_A_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_A_I(ins, attr, targetReg, op1Reg, indir, ival, IF_RWR_RRD_ARD_CNS); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_A_I(ins, attr, targetReg, indir, ival); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_AR_I: emits the code for a SIMD instruction that takes a register operand, a base memory register, +// an immediate operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// base -- The base register used for the memory address +// ival -- The immediate value +// +void emitter::emitIns_SIMD_R_R_AR_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int ival) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_AR_I(ins, attr, targetReg, op1Reg, base, 0, ival); + } + else + { + 
if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_AR_I(ins, attr, targetReg, base, 0, ival); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_C_I: emits the code for a SIMD instruction that takes a register operand, a field handle + offset, +// an immediate operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address +// offs -- The offset added to the memory address from fldHnd +// ival -- The immediate value +// +void emitter::emitIns_SIMD_R_R_C_I(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + int ival) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_C_I(ins, attr, targetReg, op1Reg, fldHnd, offs, ival); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_C_I(ins, attr, targetReg, fldHnd, offs, ival); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R_I: emits the code for a SIMD instruction that takes two register operands, an immediate operand, +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op2Reg -- The register of the second operand +// ival -- The immediate value +// +void emitter::emitIns_SIMD_R_R_R_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int ival) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, ival); + } + else + { + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 + assert(op2Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_R_I(ins, attr, targetReg, op2Reg, ival); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_S_I: emits the code for a SIMD instruction that takes a register operand, a variable index + offset, +// an imediate operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// varx -- The variable index used for the memory address +// offs -- The offset added to the memory address from varx +// ival -- The immediate value +// +void emitter::emitIns_SIMD_R_R_S_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, int ival) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_S_I(ins, attr, targetReg, op1Reg, varx, offs, ival); + } + else + { + if (op1Reg != targetReg) + { + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_S_I(ins, attr, targetReg, varx, offs, ival); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R_A: emits the code for a SIMD instruction that takes two register operands, a GenTreeIndir address, +// and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the 
first operand +// op2Reg -- The register of the second operand +// indir -- The GenTreeIndir used for the memory address +// +void emitter::emitIns_SIMD_R_R_R_A( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir) +{ + assert(IsFMAInstruction(ins)); + assert(UseVEXEncoding()); + + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 + assert(op2Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_R_A(ins, attr, targetReg, op2Reg, indir); +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R_AR: emits the code for a SIMD instruction that takes two register operands, a base memory +// register, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operands +// op2Reg -- The register of the second operand +// base -- The base register used for the memory address +// +void emitter::emitIns_SIMD_R_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber base) +{ + assert(IsFMAInstruction(ins)); + assert(UseVEXEncoding()); + + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 + assert(op2Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_R_AR(ins, attr, targetReg, op2Reg, base, 0); +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R_C: emits the code for a SIMD instruction that takes two register operands, a field handle + +// offset, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op2Reg -- The register of the second operand +// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address +// offs -- The offset added to the memory address from fldHnd +// +void emitter::emitIns_SIMD_R_R_R_C(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + regNumber op2Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs) +{ + assert(IsFMAInstruction(ins)); + assert(UseVEXEncoding()); + + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 + assert(op2Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs); +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R_R: emits the code for a SIMD instruction that takes three register operands, and that returns a +// value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op2Reg -- The register of the second operand +// op3Reg -- The register of the second operand +// +void emitter::emitIns_SIMD_R_R_R_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg) +{ + if (IsFMAInstruction(ins)) + { + assert(UseVEXEncoding()); + + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 or op3 + + assert(op2Reg != targetReg); + assert(op3Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_R_R(ins, attr, targetReg, op2Reg, op3Reg); + } + 
else if (UseVEXEncoding()) + { + assert(isAvxBlendv(ins) || isSse41Blendv(ins)); + + // convert SSE encoding of SSE4.1 instructions to VEX encoding + switch (ins) + { + case INS_blendvps: + ins = INS_vblendvps; + break; + case INS_blendvpd: + ins = INS_vblendvpd; + break; + case INS_pblendvb: + ins = INS_vpblendvb; + break; + default: + break; + } + emitIns_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3Reg); + } + else + { + assert(isSse41Blendv(ins)); + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + if (op3Reg != REG_XMM0) + { + // Ensure we aren't overwriting op1 or op2 + assert(op1Reg != REG_XMM0); + assert(op2Reg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); + } + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 or oop3 (which should be REG_XMM0) + assert(op2Reg != targetReg); + assert(targetReg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + emitIns_R_R(ins, attr, targetReg, op2Reg); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_R_S: emits the code for a SIMD instruction that takes two register operands, a variable index + +// offset, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op2Reg -- The register of the second operand +// varx -- The variable index used for the memory address +// offs -- The offset added to the memory address from varx +// +void emitter::emitIns_SIMD_R_R_R_S( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs) +{ + assert(IsFMAInstruction(ins)); + assert(UseVEXEncoding()); + + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op2 + assert(op2Reg != targetReg); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs); +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_A_R: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, +// another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op3Reg -- The register of the third operand +// indir -- The GenTreeIndir used for the memory address +// +void emitter::emitIns_SIMD_R_R_A_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir) +{ + if (UseVEXEncoding()) + { + assert(isAvxBlendv(ins) || isSse41Blendv(ins)); + + // convert SSE encoding of SSE4.1 instructions to VEX encoding + switch (ins) + { + case INS_blendvps: + { + ins = INS_vblendvps; + break; + } + + case INS_blendvpd: + { + ins = INS_vblendvpd; + break; + } + + case INS_pblendvb: + { + ins = INS_vpblendvb; + break; + } + + default: + { + break; + } + } + + emitIns_R_R_A_R(ins, attr, targetReg, op1Reg, op3Reg, indir); + } + else + { + assert(isSse41Blendv(ins)); + + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + if (op3Reg != REG_XMM0) + { + // Ensure we aren't overwriting op1 + assert(op1Reg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); + } + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op3 (which should be REG_XMM0) + assert(targetReg != 
REG_XMM0); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_A(ins, attr, targetReg, indir); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_AR_R: emits the code for a SIMD instruction that takes a register operand, a base memory +// register, another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operands +// op3Reg -- The register of the third operand +// base -- The base register used for the memory address +// +void emitter::emitIns_SIMD_R_R_AR_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base) +{ + if (UseVEXEncoding()) + { + assert(isAvxBlendv(ins) || isSse41Blendv(ins)); + + // convert SSE encoding of SSE4.1 instructions to VEX encoding + switch (ins) + { + case INS_blendvps: + { + ins = INS_vblendvps; + break; + } + + case INS_blendvpd: + { + ins = INS_vblendvpd; + break; + } + + case INS_pblendvb: + { + ins = INS_vpblendvb; + break; + } + + default: + { + break; + } + } + + emitIns_R_R_AR_R(ins, attr, targetReg, op1Reg, op3Reg, base, 0); + } + else + { + assert(isSse41Blendv(ins)); + + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + if (op3Reg != REG_XMM0) + { + // Ensure we aren't overwriting op1 + assert(op1Reg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); + } + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op3 (which should be REG_XMM0) + assert(targetReg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_AR(ins, attr, targetReg, base, 0); + } +} + +//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_C_R: emits the code for a SIMD instruction that takes a register operand, a field handle + +// offset, another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op3Reg -- The register of the third operand +// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address +// offs -- The offset added to the memory address from fldHnd +// +void emitter::emitIns_SIMD_R_R_C_R(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + regNumber op3Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs) +{ + if (UseVEXEncoding()) + { + assert(isAvxBlendv(ins) || isSse41Blendv(ins)); + + // convert SSE encoding of SSE4.1 instructions to VEX encoding + switch (ins) + { + case INS_blendvps: + { + ins = INS_vblendvps; + break; + } + + case INS_blendvpd: + { + ins = INS_vblendvpd; + break; + } + + case INS_pblendvb: + { + ins = INS_vpblendvb; + break; + } + + default: + { + break; + } + } + + emitIns_R_R_C_R(ins, attr, targetReg, op1Reg, op3Reg, fldHnd, offs); + } + else + { + assert(isSse41Blendv(ins)); + + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + if (op3Reg != REG_XMM0) + { + // Ensure we aren't overwriting op1 + assert(op1Reg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); + } + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op3 (which should be REG_XMM0) + assert(targetReg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_C(ins, attr, targetReg, fldHnd, offs); + } +} + 
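+// ---------------------------------------------------------------------------
+// Illustrative sketch (not part of this change): the emitIns_SIMD_R_R_*
+// helpers above and below all share one dispatch pattern. With VEX encoding
+// available they emit the non-destructive 3-operand form directly; otherwise
+// they first copy op1 into the target register (movaps) and then emit the
+// destructive 2-operand legacy form (the real helpers also assert that the
+// copy does not clobber op2). The stand-alone model below restates that
+// control flow using hypothetical names (SimdEmitModel, EmitRRR) that do not
+// exist in the JIT; it only prints the instructions it would have emitted.
+//
+//     #include <cstdio>
+//
+//     struct SimdEmitModel
+//     {
+//         bool useVex; // stands in for emitter::UseVEXEncoding()
+//
+//         void EmitRRR(const char* ins, int targetReg, int op1Reg, int op2Reg)
+//         {
+//             if (useVex)
+//             {
+//                 // 3-operand VEX form: target = op1 <ins> op2
+//                 std::printf("v%s xmm%d, xmm%d, xmm%d\n", ins, targetReg, op1Reg, op2Reg);
+//             }
+//             else
+//             {
+//                 if (op1Reg != targetReg)
+//                 {
+//                     // legacy SSE form is destructive: move op1 into the target first
+//                     std::printf("movaps xmm%d, xmm%d\n", targetReg, op1Reg);
+//                 }
+//                 std::printf("%s xmm%d, xmm%d\n", ins, targetReg, op2Reg);
+//             }
+//         }
+//     };
+//
+//     int main()
+//     {
+//         SimdEmitModel vex{true}, legacy{false};
+//         vex.EmitRRR("addps", 3, 1, 2);    // vaddps xmm3, xmm1, xmm2
+//         legacy.EmitRRR("addps", 3, 1, 2); // movaps xmm3, xmm1 ; addps xmm3, xmm2
+//     }
+// ---------------------------------------------------------------------------
+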
+//------------------------------------------------------------------------ +// emitIns_SIMD_R_R_S_R: emits the code for a SIMD instruction that takes a register operand, a variable index + +// offset, another register operand, and that returns a value in register +// +// Arguments: +// ins -- The instruction being emitted +// attr -- The emit attribute +// targetReg -- The target register +// op1Reg -- The register of the first operand +// op3Reg -- The register of the third operand +// varx -- The variable index used for the memory address +// offs -- The offset added to the memory address from varx +// +void emitter::emitIns_SIMD_R_R_S_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, int varx, int offs) +{ + if (UseVEXEncoding()) + { + assert(isAvxBlendv(ins) || isSse41Blendv(ins)); + + // convert SSE encoding of SSE4.1 instructions to VEX encoding + switch (ins) + { + case INS_blendvps: + { + ins = INS_vblendvps; + break; + } + + case INS_blendvpd: + { + ins = INS_vblendvpd; + break; + } + + case INS_pblendvb: + { + ins = INS_vpblendvb; + break; + } + + default: + { + break; + } + } + + emitIns_R_R_S_R(ins, attr, targetReg, op1Reg, op3Reg, varx, offs); + } + else + { + assert(isSse41Blendv(ins)); + + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + if (op3Reg != REG_XMM0) + { + // Ensure we aren't overwriting op1 + assert(op1Reg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); + } + if (op1Reg != targetReg) + { + // Ensure we aren't overwriting op3 (which should be REG_XMM0) + assert(targetReg != REG_XMM0); + + emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + + emitIns_R_S(ins, attr, targetReg, varx, offs); + } +} +#endif // FEATURE_HW_INTRINSICS + +/***************************************************************************** + * + * The following add instructions referencing stack-based local variables. 
+ */ + +void emitter::emitIns_S(instruction ins, emitAttr attr, int varx, int offs) +{ + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstr(attr); + insFormat fmt = emitInsModeFormat(ins, IF_SRD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + + sz = emitInsSizeSV(id, insCodeMR(ins), varx, offs); + id->idCodeSize(sz); + +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + dispIns(id); + emitCurIGsize += sz; + + emitAdjustStackDepthPushPop(ins); +} + +void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs) +{ + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstr(attr); + insFormat fmt = emitInsModeFormat(ins, IF_SRD_RRD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(ireg); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + + sz = emitInsSizeSV(id, insCodeMR(ins), varx, offs); + +#ifdef TARGET_X86 + if (attr == EA_1BYTE) + { + assert(isByteReg(ireg)); + } +#endif + + id->idCodeSize(sz); +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs) +{ + emitAttr size = EA_SIZE(attr); + noway_assert(emitVerifyEncodable(ins, size, ireg)); + + UNATIVE_OFFSET sz; + instrDesc* id = emitNewInstr(attr); + insFormat fmt = emitInsModeFormat(ins, IF_RRD_SRD); + + id->idIns(ins); + id->idInsFmt(fmt); + id->idReg1(ireg); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + + sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs); + id->idCodeSize(sz); +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + dispIns(id); + emitCurIGsize += sz; +} + +void emitter::emitIns_S_I(instruction ins, emitAttr attr, int varx, int offs, int val) +{ +#ifdef TARGET_AMD64 + // mov reg, imm64 is the only opcode which takes a full 8 byte immediate + // all other opcodes take a sign-extended 4-byte immediate + noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); +#endif + + insFormat fmt; + + switch (ins) + { + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + assert(val != 1); + fmt = IF_SRW_SHF; + val &= 0x7F; + break; + + default: + fmt = emitInsModeFormat(ins, IF_SRD_CNS); + break; + } + + instrDesc* id = emitNewInstrCns(attr, val); + id->idIns(ins); + id->idInsFmt(fmt); + id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); + + UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeMI(ins), varx, offs, val); + id->idCodeSize(sz); +#ifdef DEBUG + id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; +#endif + dispIns(id); + emitCurIGsize += sz; +} + +/***************************************************************************** + * + * Record that a jump instruction uses the short encoding + * + */ +void emitter::emitSetShortJump(instrDescJmp* id) +{ + if (id->idjKeepLong) + { + return; + } + + id->idjShort = true; +} + +/***************************************************************************** + * + * Add a jmp instruction. + * When dst is NULL, instrCount specifies number of instructions + * to jump: positive is forward, negative is backward. + */ + +void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 */) +{ + assert(false); + +} + +#if !FEATURE_FIXED_OUT_ARGS + +//------------------------------------------------------------------------ +// emitAdjustStackDepthPushPop: Adjust the current and maximum stack depth. 
+// +// Arguments: +// ins - the instruction. Only INS_push and INS_pop adjust the stack depth. +// +// Notes: +// 1. Alters emitCurStackLvl and possibly emitMaxStackDepth. +// 2. emitCntStackDepth must be set (0 in prolog/epilog, one DWORD elsewhere) +// +void emitter::emitAdjustStackDepthPushPop(instruction ins) +{ + if (ins == INS_push) + { + emitCurStackLvl += emitCntStackDepth; + + if (emitMaxStackDepth < emitCurStackLvl) + { + JITDUMP("Upping emitMaxStackDepth from %d to %d\n", emitMaxStackDepth, emitCurStackLvl); + emitMaxStackDepth = emitCurStackLvl; + } + } + else if (ins == INS_pop) + { + emitCurStackLvl -= emitCntStackDepth; + assert((int)emitCurStackLvl >= 0); + } +} + +//------------------------------------------------------------------------ +// emitAdjustStackDepth: Adjust the current and maximum stack depth. +// +// Arguments: +// ins - the instruction. Only INS_add and INS_sub adjust the stack depth. +// It is assumed that the add/sub is on the stack pointer. +// val - the number of bytes to add to or subtract from the stack pointer. +// +// Notes: +// 1. Alters emitCurStackLvl and possibly emitMaxStackDepth. +// 2. emitCntStackDepth must be set (0 in prolog/epilog, one DWORD elsewhere) +// +void emitter::emitAdjustStackDepth(instruction ins, ssize_t val) +{ + // If we're in the prolog or epilog, or otherwise not tracking the stack depth, just return. + if (emitCntStackDepth == 0) + return; + + if (ins == INS_sub) + { + S_UINT32 newStackLvl(emitCurStackLvl); + newStackLvl += S_UINT32(val); + noway_assert(!newStackLvl.IsOverflow()); + + emitCurStackLvl = newStackLvl.Value(); + + if (emitMaxStackDepth < emitCurStackLvl) + { + JITDUMP("Upping emitMaxStackDepth from %d to %d\n", emitMaxStackDepth, emitCurStackLvl); + emitMaxStackDepth = emitCurStackLvl; + } + } + else if (ins == INS_add) + { + S_UINT32 newStackLvl = S_UINT32(emitCurStackLvl) - S_UINT32(val); + noway_assert(!newStackLvl.IsOverflow()); + + emitCurStackLvl = newStackLvl.Value(); + } +} + +#endif // EMIT_TRACK_STACK_DEPTH + +/***************************************************************************** + * + * Add a call instruction (direct or indirect). + * argSize<0 means that the caller will pop the arguments + * + * The other arguments are interpreted depending on callType as shown: + * Unless otherwise specified, ireg,xreg,xmul,disp should have default values. + * + * EC_FUNC_TOKEN : addr is the method address + * EC_FUNC_TOKEN_INDIR : addr is the indirect method address + * EC_FUNC_ADDR : addr is the absolute address of the function + * EC_FUNC_VIRTUAL : "call [ireg+disp]" + * + * If callType is one of these emitCallTypes, addr has to be NULL. + * EC_INDIR_R : "call ireg". + * EC_INDIR_SR : "call lcl" (eg. call [ebp-8]). + * EC_INDIR_C : "call clsVar" (eg. 
call [clsVarAddr]) + * EC_INDIR_ARD : "call [ireg+xreg*xmul+disp]" + * + */ + +// clang-format off +void emitter::emitIns_Call(EmitCallType callType, + CORINFO_METHOD_HANDLE methHnd, + INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) // used to report call sites to the EE + void* addr, + ssize_t argSize, + emitAttr retSize + MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize), + VARSET_VALARG_TP ptrVars, + regMaskTP gcrefRegs, + regMaskTP byrefRegs, + IL_OFFSETX ilOffset, // = BAD_IL_OFFSET + regNumber ireg, // = REG_NA + regNumber xreg, // = REG_NA + unsigned xmul, // = 0 + ssize_t disp, // = 0 + bool isJump) // = false +// clang-format on +{ + /* Sanity check the arguments depending on callType */ + + assert(callType < EC_COUNT); + assert((callType != EC_FUNC_TOKEN && callType != EC_FUNC_TOKEN_INDIR && callType != EC_FUNC_ADDR) || + (ireg == REG_NA && xreg == REG_NA && xmul == 0 && disp == 0)); + assert(callType != EC_FUNC_VIRTUAL || (ireg < REG_COUNT && xreg == REG_NA && xmul == 0)); + assert(callType < EC_INDIR_R || callType == EC_INDIR_ARD || callType == EC_INDIR_C || addr == nullptr); + assert(callType != EC_INDIR_R || (ireg < REG_COUNT && xreg == REG_NA && xmul == 0 && disp == 0)); + assert(callType != EC_INDIR_SR || + (ireg == REG_NA && xreg == REG_NA && xmul == 0 && disp < (int)emitComp->lvaCount)); + assert(callType != EC_INDIR_C || (ireg == REG_NA && xreg == REG_NA && xmul == 0 && disp != 0)); + + // Our stack level should be always greater than the bytes of arguments we push. Just + // a sanity test. + assert((unsigned)abs((signed)argSize) <= codeGen->genStackLevel); + + // Trim out any callee-trashed registers from the live set. + regMaskTP savedSet = emitGetGCRegsSavedOrModified(methHnd); + gcrefRegs &= savedSet; + byrefRegs &= savedSet; + +#ifdef DEBUG + if (EMIT_GC_VERBOSE) + { + printf("\t\t\t\t\t\t\tCall: GCvars=%s ", VarSetOps::ToString(emitComp, ptrVars)); + dumpConvertedVarSet(emitComp, ptrVars); + printf(", gcrefRegs="); + printRegMaskInt(gcrefRegs); + emitDispRegSet(gcrefRegs); + printf(", byrefRegs="); + printRegMaskInt(byrefRegs); + emitDispRegSet(byrefRegs); + printf("\n"); + } +#endif + + /* Managed RetVal: emit sequence point for the call */ + if (emitComp->opts.compDbgInfo && ilOffset != BAD_IL_OFFSET) + { + codeGen->genIPmappingAdd(ilOffset, false); + } + + /* + We need to allocate the appropriate instruction descriptor based + on whether this is a direct/indirect call, and whether we need to + record an updated set of live GC variables. + + The stats for a ton of classes is as follows: + + Direct call w/o GC vars 220,216 + Indir. call w/o GC vars 144,781 + + Direct call with GC vars 9,440 + Indir. call with GC vars 5,768 + */ + + instrDesc* id; + + assert(argSize % REGSIZE_BYTES == 0); + int argCnt = (int)(argSize / (int)REGSIZE_BYTES); // we need a signed-divide + + if (callType >= EC_FUNC_VIRTUAL) + { + /* Indirect call, virtual calls */ + + assert(callType == EC_FUNC_VIRTUAL || callType == EC_INDIR_R || callType == EC_INDIR_SR || + callType == EC_INDIR_C || callType == EC_INDIR_ARD); + + id = emitNewInstrCallInd(argCnt, disp, ptrVars, gcrefRegs, byrefRegs, + retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize)); + } + else + { + // Helper/static/nonvirtual/function calls (direct or through handle), + // and calls to an absolute addr. 
+
+
+        assert(callType == EC_FUNC_TOKEN || callType == EC_FUNC_TOKEN_INDIR || callType == EC_FUNC_ADDR);
+
+        id = emitNewInstrCallDir(argCnt, ptrVars, gcrefRegs, byrefRegs,
+                                 retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize));
+    }
+
+    /* Update the emitter's live GC ref sets */
+
+    VarSetOps::Assign(emitComp, emitThisGCrefVars, ptrVars);
+    emitThisGCrefRegs = gcrefRegs;
+    emitThisByrefRegs = byrefRegs;
+
+    /* Set the instruction - special case jumping a function */
+    instruction ins = INS_call;
+
+    if (isJump)
+    {
+        assert(callType == EC_FUNC_TOKEN || callType == EC_FUNC_TOKEN_INDIR || callType == EC_INDIR_ARD);
+        if (callType == EC_FUNC_TOKEN)
+        {
+            ins = INS_l_jmp;
+        }
+        else
+        {
+            ins = INS_i_jmp;
+        }
+    }
+    id->idIns(ins);
+
+    id->idSetIsNoGC(emitNoGChelper(methHnd));
+
+    UNATIVE_OFFSET sz;
+
+    // Record the address: method, indirection, or funcptr
+    if (callType >= EC_FUNC_VIRTUAL)
+    {
+        // This is an indirect call (either a virtual call or func ptr call)
+
+        switch (callType)
+        {
+            case EC_INDIR_C:
+                // Indirect call using an absolute code address.
+                // Must be marked as relocatable and is done at the
+                // branch target location.
+                goto CALL_ADDR_MODE;
+
+            case EC_INDIR_R: // the address is in a register
+
+                id->idSetIsCallRegPtr();
+
+                FALLTHROUGH;
+
+            case EC_INDIR_ARD: // the address is an indirection
+
+                goto CALL_ADDR_MODE;
+
+            case EC_INDIR_SR: // the address is in a lcl var
+
+                id->idInsFmt(IF_SRD);
+                // disp is really a lclVarNum
+                noway_assert((unsigned)disp == (size_t)disp);
+                id->idAddr()->iiaLclVar.initLclVarAddr((unsigned)disp, 0);
+                sz = emitInsSizeSV(id, insCodeMR(INS_call), (unsigned)disp, 0);
+
+                break;
+
+            case EC_FUNC_VIRTUAL:
+
+            CALL_ADDR_MODE:
+
+                // fall-through
+
+                // The function is "ireg" if id->idIsCallRegPtr(),
+                // else [ireg+xmul*xreg+disp]
+
+                id->idInsFmt(IF_ARD);
+
+                id->idAddr()->iiaAddrMode.amBaseReg = ireg;
+                id->idAddr()->iiaAddrMode.amIndxReg = xreg;
+                id->idAddr()->iiaAddrMode.amScale = xmul ? emitEncodeScale(xmul) : emitter::OPSZ1;
+
+                sz = emitInsSizeAM(id, insCodeMR(INS_call));
+
+                if (ireg == REG_NA && xreg == REG_NA)
+                {
+                    if (codeGen->genCodeIndirAddrNeedsReloc(disp))
+                    {
+                        id->idSetIsDspReloc();
+                    }
+#ifdef TARGET_AMD64
+                    else
+                    {
+                        // An absolute indir address that doesn't need reloc should fit within 32-bits
+                        // to be encoded as offset relative to zero. This addr mode requires an extra
+                        // SIB byte
+                        noway_assert(static_cast<size_t>(reinterpret_cast<intptr_t>(addr)) == (size_t)addr);
+                        sz++;
+                    }
+#endif // TARGET_AMD64
+                }
+
+                break;
+
+            default:
+                NO_WAY("unexpected instruction");
+                break;
+        }
+    }
+    else if (callType == EC_FUNC_TOKEN_INDIR)
+    {
+        /* "call [method_addr]" */
+
+        assert(addr != nullptr);
+
+        id->idInsFmt(IF_METHPTR);
+        id->idAddr()->iiaAddr = (BYTE*)addr;
+        sz = 6;
+
+        // Since this is an indirect call through a pointer and we don't
+        // currently pass in emitAttr into this function, we query codegen
+        // whether addr needs a reloc.
+        if (codeGen->genCodeIndirAddrNeedsReloc((size_t)addr))
+        {
+            id->idSetIsDspReloc();
+        }
+#ifdef TARGET_AMD64
+        else
+        {
+            // An absolute indir address that doesn't need reloc should fit within 32-bits
+            // to be encoded as offset relative to zero. This addr mode requires an extra
+            // SIB byte
+            noway_assert(static_cast<size_t>(reinterpret_cast<intptr_t>(addr)) == (size_t)addr);
+            sz++;
+        }
+#endif // TARGET_AMD64
+    }
+    else
+    {
+        /* This is a simple direct call: "call helper/method/addr" */
+
+        assert(callType == EC_FUNC_TOKEN || callType == EC_FUNC_ADDR);
+
+        assert(addr != nullptr);
+
+        id->idInsFmt(IF_METHOD);
+        sz = 5;
+
+        id->idAddr()->iiaAddr = (BYTE*)addr;
+
+        if (callType == EC_FUNC_ADDR)
+        {
+            id->idSetIsCallAddr();
+        }
+
+        // Direct call to a method and no addr indirection is needed.
+        if (codeGen->genCodeAddrNeedsReloc((size_t)addr))
+        {
+            id->idSetIsDspReloc();
+        }
+    }
+
+#ifdef DEBUG
+    if (emitComp->verbose && 0)
+    {
+        if (id->idIsLargeCall())
+        {
+            if (callType >= EC_FUNC_VIRTUAL)
+            {
+                printf("[%02u] Rec call GC vars = %s\n", id->idDebugOnlyInfo()->idNum,
+                       VarSetOps::ToString(emitComp, ((instrDescCGCA*)id)->idcGCvars));
+            }
+            else
+            {
+                printf("[%02u] Rec call GC vars = %s\n", id->idDebugOnlyInfo()->idNum,
+                       VarSetOps::ToString(emitComp, ((instrDescCGCA*)id)->idcGCvars));
+            }
+        }
+    }
+
+    id->idDebugOnlyInfo()->idMemCookie = (size_t)methHnd; // method token
+    id->idDebugOnlyInfo()->idCallSig = sigInfo;
+#endif // DEBUG
+
+#ifdef LATE_DISASM
+    if (addr != nullptr)
+    {
+        codeGen->getDisAssembler().disSetMethod((size_t)addr, methHnd);
+    }
+#endif // LATE_DISASM
+
+    id->idCodeSize(sz);
+
+    dispIns(id);
+    emitCurIGsize += sz;
+
+#if !FEATURE_FIXED_OUT_ARGS
+
+    /* The call will pop the arguments */
+
+    if (emitCntStackDepth && argSize > 0)
+    {
+        noway_assert((ssize_t)emitCurStackLvl >= argSize);
+        emitCurStackLvl -= (int)argSize;
+        assert((int)emitCurStackLvl >= 0);
+    }
+
+#endif // !FEATURE_FIXED_OUT_ARGS
+}
+
+#ifdef DEBUG
+/*****************************************************************************
+ *
+ *  The following called for each recorded instruction -- use for debugging.
+ */
+void emitter::emitInsSanityCheck(instrDesc* id)
+{
+    // make certain you only try to put relocs on things that can have them.
+    ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()];
+    if ((idOp == ID_OP_SCNS) && id->idIsLargeCns())
+    {
+        idOp = ID_OP_CNS;
+    }
+
+    if (id->idIsDspReloc())
+    {
+        assert(idOp == ID_OP_NONE || idOp == ID_OP_AMD || idOp == ID_OP_DSP || idOp == ID_OP_DSP_CNS ||
+               idOp == ID_OP_AMD_CNS || idOp == ID_OP_SPEC || idOp == ID_OP_CALL || idOp == ID_OP_JMP ||
+               idOp == ID_OP_LBL);
+    }
+
+    if (id->idIsCnsReloc())
+    {
+        assert(idOp == ID_OP_CNS || idOp == ID_OP_AMD_CNS || idOp == ID_OP_DSP_CNS || idOp == ID_OP_SPEC ||
+               idOp == ID_OP_CALL || idOp == ID_OP_JMP);
+    }
+}
+#endif
+
+/*****************************************************************************
+ *
+ *  Return the allocated size (in bytes) of the given instruction descriptor.
+ */
+
+size_t emitter::emitSizeOfInsDsc(instrDesc* id)
+{
+    if (emitIsScnsInsDsc(id))
+    {
+        return SMALL_IDSC_SIZE;
+    }
+
+    assert((unsigned)id->idInsFmt() < emitFmtCount);
+
+    ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()];
+
+    // An INS_call instruction may use a "fat" direct/indirect call descriptor
+    // except for a local call to a label (i.e.
call to a finally) + // Only ID_OP_CALL and ID_OP_SPEC check for this, so we enforce that the + // INS_call instruction always uses one of these idOps + + if (id->idIns() == INS_call) + { + assert(idOp == ID_OP_CALL || // is a direct call + idOp == ID_OP_SPEC || // is a indirect call + idOp == ID_OP_JMP); // is a local call to finally clause + } + + switch (idOp) + { + case ID_OP_NONE: +#if FEATURE_LOOP_ALIGN + if (id->idIns() == INS_align) + { + return sizeof(instrDescAlign); + } +#endif + break; + + case ID_OP_LBL: + return sizeof(instrDescLbl); + + case ID_OP_JMP: + return sizeof(instrDescJmp); + + case ID_OP_CALL: + case ID_OP_SPEC: + if (id->idIsLargeCall()) + { + /* Must be a "fat" indirect call descriptor */ + return sizeof(instrDescCGCA); + } + + FALLTHROUGH; + + case ID_OP_SCNS: + case ID_OP_CNS: + case ID_OP_DSP: + case ID_OP_DSP_CNS: + if (id->idIsLargeCns()) + { + if (id->idIsLargeDsp()) + { + return sizeof(instrDescCnsDsp); + } + else + { + return sizeof(instrDescCns); + } + } + else + { + if (id->idIsLargeDsp()) + { + return sizeof(instrDescDsp); + } + else + { + return sizeof(instrDesc); + } + } + case ID_OP_AMD: + case ID_OP_AMD_CNS: + if (id->idIsLargeCns()) + { + if (id->idIsLargeDsp()) + { + return sizeof(instrDescCnsAmd); + } + else + { + return sizeof(instrDescCns); + } + } + else + { + if (id->idIsLargeDsp()) + { + return sizeof(instrDescAmd); + } + else + { + return sizeof(instrDesc); + } + } + + default: + NO_WAY("unexpected instruction descriptor format"); + break; + } + + return sizeof(instrDesc); +} + +/*****************************************************************************/ +#ifdef DEBUG +/***************************************************************************** + * + * Return a string that represents the given register. 
+ */ + +const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) +{ + static char rb[2][128]; + static unsigned char rbc = 0; + + const char* rn = emitComp->compRegVarName(reg, varName); + +#ifdef TARGET_AMD64 + char suffix = '\0'; + + switch (EA_SIZE(attr)) + { + case EA_32BYTE: + return emitYMMregName(reg); + + case EA_16BYTE: + return emitXMMregName(reg); + + case EA_8BYTE: + if ((REG_XMM0 <= reg) && (reg <= REG_XMM15)) + { + return emitXMMregName(reg); + } + break; + + case EA_4BYTE: + if ((REG_XMM0 <= reg) && (reg <= REG_XMM15)) + { + return emitXMMregName(reg); + } + + if (reg > REG_R15) + { + break; + } + + if (reg > REG_RDI) + { + suffix = 'd'; + goto APPEND_SUFFIX; + } + rbc = (rbc + 1) % 2; + rb[rbc][0] = 'e'; + rb[rbc][1] = rn[1]; + rb[rbc][2] = rn[2]; + rb[rbc][3] = 0; + rn = rb[rbc]; + break; + + case EA_2BYTE: + if (reg > REG_RDI) + { + suffix = 'w'; + goto APPEND_SUFFIX; + } + rn++; + break; + + case EA_1BYTE: + if (reg > REG_RDI) + { + suffix = 'b'; + APPEND_SUFFIX: + rbc = (rbc + 1) % 2; + rb[rbc][0] = rn[0]; + rb[rbc][1] = rn[1]; + if (rn[2]) + { + assert(rn[3] == 0); + rb[rbc][2] = rn[2]; + rb[rbc][3] = suffix; + rb[rbc][4] = 0; + } + else + { + rb[rbc][2] = suffix; + rb[rbc][3] = 0; + } + } + else + { + rbc = (rbc + 1) % 2; + rb[rbc][0] = rn[1]; + if (reg < 4) + { + rb[rbc][1] = 'l'; + rb[rbc][2] = 0; + } + else + { + rb[rbc][1] = rn[2]; + rb[rbc][2] = 'l'; + rb[rbc][3] = 0; + } + } + + rn = rb[rbc]; + break; + + default: + break; + } +#endif // TARGET_AMD64 + +#ifdef TARGET_X86 + assert(strlen(rn) >= 3); + + switch (EA_SIZE(attr)) + { + case EA_32BYTE: + return emitYMMregName(reg); + + case EA_16BYTE: + return emitXMMregName(reg); + + case EA_8BYTE: + if ((REG_XMM0 <= reg) && (reg <= REG_XMM7)) + { + return emitXMMregName(reg); + } + break; + + case EA_4BYTE: + if ((REG_XMM0 <= reg) && (reg <= REG_XMM7)) + { + return emitXMMregName(reg); + } + break; + + case EA_2BYTE: + rn++; + break; + + case EA_1BYTE: + rbc = (rbc + 1) % 2; + rb[rbc][0] = rn[1]; + rb[rbc][1] = 'l'; + strcpy_s(&rb[rbc][2], sizeof(rb[0]) - 2, rn + 3); + + rn = rb[rbc]; + break; + + default: + break; + } +#endif // TARGET_X86 + +#if 0 + // The following is useful if you want register names to be tagged with * or ^ representing gcref or byref, respectively, + // however it's possibly not interesting most of the time. + if (EA_IS_GCREF(attr) || EA_IS_BYREF(attr)) + { + if (rn != rb[rbc]) + { + rbc = (rbc+1)%2; + strcpy_s(rb[rbc], sizeof(rb[rbc]), rn); + rn = rb[rbc]; + } + + if (EA_IS_GCREF(attr)) + { + strcat_s(rb[rbc], sizeof(rb[rbc]), "*"); + } + else if (EA_IS_BYREF(attr)) + { + strcat_s(rb[rbc], sizeof(rb[rbc]), "^"); + } + } +#endif // 0 + + return rn; +} + +/***************************************************************************** + * + * Return a string that represents the given FP register. + */ + +const char* emitter::emitFPregName(unsigned reg, bool varName) +{ + assert(reg < REG_COUNT); + + return emitComp->compFPregVarName((regNumber)(reg), varName); +} + +/***************************************************************************** + * + * Return a string that represents the given XMM register. 
+ */ + +const char* emitter::emitXMMregName(unsigned reg) +{ + static const char* const regNames[] = { +#define REGDEF(name, rnum, mask, sname) "x" sname, +#include "register.h" + }; + + assert(reg < REG_COUNT); + assert(reg < _countof(regNames)); + + return regNames[reg]; +} + +/***************************************************************************** + * + * Return a string that represents the given YMM register. + */ + +const char* emitter::emitYMMregName(unsigned reg) +{ + static const char* const regNames[] = { +#define REGDEF(name, rnum, mask, sname) "y" sname, +#include "register.h" + }; + + assert(reg < REG_COUNT); + assert(reg < _countof(regNames)); + + return regNames[reg]; +} + +/***************************************************************************** + * + * Display a static data member reference. + */ + +void emitter::emitDispClsVar(CORINFO_FIELD_HANDLE fldHnd, ssize_t offs, bool reloc /* = false */) +{ + int doffs; + + /* Filter out the special case of fs:[offs] */ + + // Munge any pointers if we want diff-able disassembly + if (emitComp->opts.disDiffable) + { + ssize_t top12bits = (offs >> 20); + if ((top12bits != 0) && (top12bits != -1)) + { + offs = 0xD1FFAB1E; + } + } + + if (fldHnd == FLD_GLOBAL_FS) + { + printf("FS:[0x%04X]", offs); + return; + } + + if (fldHnd == FLD_GLOBAL_DS) + { + printf("[0x%04X]", offs); + return; + } + + printf("["); + + doffs = Compiler::eeGetJitDataOffs(fldHnd); + + if (reloc) + { + printf("reloc "); + } + + if (doffs >= 0) + { + if (doffs & 1) + { + printf("@CNS%02u", doffs - 1); + } + else + { + printf("@RWD%02u", doffs); + } + + if (offs) + { + printf("%+Id", offs); + } + } + else + { + printf("classVar[%#x]", emitComp->dspPtr(fldHnd)); + + if (offs) + { + printf("%+Id", offs); + } + } + + printf("]"); + + if (emitComp->opts.varNames && offs < 0) + { + printf("'%s", emitComp->eeGetFieldName(fldHnd)); + if (offs) + { + printf("%+Id", offs); + } + printf("'"); + } +} + +/***************************************************************************** + * + * Display a stack frame reference. 
+ */ + +void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) +{ + int addr; + bool bEBP; + + printf("["); + + if (!asmfm || emitComp->lvaDoneFrameLayout == Compiler::NO_FRAME_LAYOUT) + { + if (varx < 0) + { + printf("TEMP_%02u", -varx); + } + else + { + printf("V%02u", +varx); + } + + if (disp < 0) + { + printf("-0x%X", -disp); + } + else if (disp > 0) + { + printf("+0x%X", +disp); + } + } + + if (emitComp->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) + { + if (!asmfm) + { + printf(" "); + } + + addr = emitComp->lvaFrameAddress(varx, &bEBP) + disp; + + if (bEBP) + { + printf(STR_FPBASE); + + if (addr < 0) + { + printf("-%02XH", -addr); + } + else if (addr > 0) + { + printf("+%02XH", addr); + } + } + else + { + /* Adjust the offset by amount currently pushed on the stack */ + + printf(STR_SPBASE); + + if (addr < 0) + { + printf("-%02XH", -addr); + } + else if (addr > 0) + { + printf("+%02XH", addr); + } + +#if !FEATURE_FIXED_OUT_ARGS + + if (emitCurStackLvl) + printf("+%02XH", emitCurStackLvl); + +#endif // !FEATURE_FIXED_OUT_ARGS + } + } + + printf("]"); + + if (varx >= 0 && emitComp->opts.varNames) + { + LclVarDsc* varDsc; + const char* varName; + + assert((unsigned)varx < emitComp->lvaCount); + varDsc = emitComp->lvaTable + varx; + varName = emitComp->compLocalVarName(varx, offs); + + if (varName) + { + printf("'%s", varName); + + if (disp < 0) + { + printf("-%d", -disp); + } + else if (disp > 0) + { + printf("+%d", +disp); + } + + printf("'"); + } + } +} + +/***************************************************************************** + * + * Display an reloc value + * If we are formatting for an assembly listing don't print the hex value + * since it will prevent us from doing assembly diffs + */ +void emitter::emitDispReloc(ssize_t value) +{ + if (emitComp->opts.disAsm) + { + printf("(reloc)"); + } + else + { + printf("(reloc 0x%Ix)", emitComp->dspPtr(value)); + } +} + +/***************************************************************************** + * + * Display an address mode. + */ + +void emitter::emitDispAddrMode(instrDesc* id, bool noDetail) +{ + assert(false); +} + +/***************************************************************************** + * + * If the given instruction is a shift, display the 2nd operand. 
+ */ + +void emitter::emitDispShift(instruction ins, int cnt) +{ + switch (ins) + { + case INS_rcl_1: + case INS_rcr_1: + case INS_rol_1: + case INS_ror_1: + case INS_shl_1: + case INS_shr_1: + case INS_sar_1: + printf(", 1"); + break; + + case INS_rcl: + case INS_rcr: + case INS_rol: + case INS_ror: + case INS_shl: + case INS_shr: + case INS_sar: + printf(", cl"); + break; + + case INS_rcl_N: + case INS_rcr_N: + case INS_rol_N: + case INS_ror_N: + case INS_shl_N: + case INS_shr_N: + case INS_sar_N: + printf(", %d", cnt); + break; + + default: + break; + } +} + +/***************************************************************************** + * + * Display (optionally) the bytes for the instruction encoding in hex + */ + +void emitter::emitDispInsHex(instrDesc* id, BYTE* code, size_t sz) +{ + // We do not display the instruction hex if we want diff-able disassembly + if (!emitComp->opts.disDiffable) + { +#ifdef TARGET_AMD64 + // how many bytes per instruction we format for + const size_t digits = 10; +#else // TARGET_X86 + const size_t digits = 6; +#endif + printf(" "); + for (unsigned i = 0; i < sz; i++) + { + printf("%02X", (*((BYTE*)(code + i)))); + } + + if (sz < digits) + { + printf("%.*s", 2 * (digits - sz), " "); + } + } +} + +/***************************************************************************** + * + * Display the given instruction. + */ + +void emitter::emitDispIns( + instrDesc* id, bool isNew, bool doffs, bool asmfm, unsigned offset, BYTE* code, size_t sz, insGroup* ig) +{ + assert(false); + +} + +/*****************************************************************************/ +#endif + +/***************************************************************************** + * + * Output nBytes bytes of NOP instructions + */ + +//static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes) +//{ +// assert(nBytes <= 15); +// +//#ifndef TARGET_AMD64 +// // TODO-X86-CQ: when VIA C3 CPU's are out of circulation, switch to the +// // more efficient real NOP: 0x0F 0x1F +modR/M +// // Also can't use AMD recommended, multiple size prefixes (i.e. 
0x66 0x66 0x90 for 3 byte NOP) +// // because debugger and msdis don't like it, so maybe VIA doesn't either +// // So instead just stick to repeating single byte nops +// +// switch (nBytes) +// { +// case 15: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 14: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 13: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 12: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 11: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 10: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 9: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 8: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 7: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 6: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 5: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 4: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 3: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 2: +// *dst++ = 0x90; +// FALLTHROUGH; +// case 1: +// *dst++ = 0x90; +// break; +// case 0: +// break; +// } +//#else // TARGET_AMD64 +// switch (nBytes) +// { +// case 2: +// *dst++ = 0x66; +// FALLTHROUGH; +// case 1: +// *dst++ = 0x90; +// break; +// case 0: +// break; +// case 3: +// *dst++ = 0x0F; +// *dst++ = 0x1F; +// *dst++ = 0x00; +// break; +// case 4: +// *dst++ = 0x0F; +// *dst++ = 0x1F; +// *dst++ = 0x40; +// *dst++ = 0x00; +// break; +// case 6: +// *dst++ = 0x66; +// FALLTHROUGH; +// case 5: +// *dst++ = 0x0F; +// *dst++ = 0x1F; +// *dst++ = 0x44; +// *dst++ = 0x00; +// *dst++ = 0x00; +// break; +// case 7: +// *dst++ = 0x0F; +// *dst++ = 0x1F; +// *dst++ = 0x80; +// *dst++ = 0x00; +// *dst++ = 0x00; +// *dst++ = 0x00; +// *dst++ = 0x00; +// break; +// case 15: +// // More than 3 prefixes is slower than just 2 NOPs +// dst = emitOutputNOP(emitOutputNOP(dst, 7), 8); +// break; +// case 14: +// // More than 3 prefixes is slower than just 2 NOPs +// dst = emitOutputNOP(emitOutputNOP(dst, 7), 7); +// break; +// case 13: +// // More than 3 prefixes is slower than just 2 NOPs +// dst = emitOutputNOP(emitOutputNOP(dst, 5), 8); +// break; +// case 12: +// // More than 3 prefixes is slower than just 2 NOPs +// dst = emitOutputNOP(emitOutputNOP(dst, 4), 8); +// break; +// case 11: +// *dst++ = 0x66; +// FALLTHROUGH; +// case 10: +// *dst++ = 0x66; +// FALLTHROUGH; +// case 9: +// *dst++ = 0x66; +// FALLTHROUGH; +// case 8: +// *dst++ = 0x0F; +// *dst++ = 0x1F; +// *dst++ = 0x84; +// *dst++ = 0x00; +// *dst++ = 0x00; +// *dst++ = 0x00; +// *dst++ = 0x00; +// *dst++ = 0x00; +// break; +// } +//#endif // TARGET_AMD64 +// +// return dst; +//} + +//-------------------------------------------------------------------- +// emitOutputAlign: Outputs NOP to align the loop +// +// Arguments: +// ig - Current instruction group +// id - align instruction that holds amount of padding (NOPs) to add +// dst - Destination buffer +// +// Return Value: +// None. +// +// Notes: +// Amount of padding needed to align the loop is already calculated. This +// method extracts that information and inserts suitable NOP instructions. +// +BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Output an instruction involving an address mode. + */ + +BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Output an instruction involving a stack frame value. 
+ */ + +BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Output an instruction with a static data member (class variable). + */ + +BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Output an instruction with one register operand. + */ + +BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) +{ + code_t code; + + instruction ins = id->idIns(); + regNumber reg = id->idReg1(); + emitAttr size = id->idOpSize(); + + // We would to update GC info correctly + assert(!IsSSEInstruction(ins)); + assert(!IsAVXInstruction(ins)); + + // Get the 'base' opcode + switch (ins) + { + case INS_inc: + case INS_dec: + +#ifdef TARGET_AMD64 + if (true) +#else + if (size == EA_1BYTE) +#endif + { + assert(INS_inc_l == INS_inc + 1); + assert(INS_dec_l == INS_dec + 1); + + // Can't use the compact form, use the long form + ins = (instruction)(ins + 1); + if (size == EA_2BYTE) + { + // Output a size prefix for a 16-bit operand + dst += emitOutputByte(dst, 0x66); + } + + code = insCodeRR(ins); + if (size != EA_1BYTE) + { + // Set the 'w' bit to get the large version + code |= 0x1; + } + + if (TakesRexWPrefix(ins, size)) + { + code = AddRexWPrefix(ins, code); + } + + // Register... + unsigned regcode = insEncodeReg012(ins, reg, size, &code); + + // Output the REX prefix + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + + dst += emitOutputWord(dst, code | (regcode << 8)); + } + else + { + if (size == EA_2BYTE) + { + // Output a size prefix for a 16-bit operand + dst += emitOutputByte(dst, 0x66); + } + dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(ins, reg, size, nullptr)); + } + break; + + case INS_pop: + case INS_pop_hide: + case INS_push: + case INS_push_hide: + + assert(size == EA_PTRSIZE); + code = insEncodeOpreg(ins, reg, size); + + assert(!TakesVexPrefix(ins)); + assert(!TakesRexWPrefix(ins, size)); + + // Output the REX prefix + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + + dst += emitOutputByte(dst, code); + break; + + case INS_bswap: + { + assert(size >= EA_4BYTE && size <= EA_PTRSIZE); // 16-bit BSWAP is undefined + + // The Intel instruction set reference for BSWAP states that extended registers + // should be enabled via REX.R, but per Vol. 2A, Sec. 2.2.1.2 (see also Figure 2-7), + // REX.B should instead be used if the register is encoded in the opcode byte itself. + // Therefore the default logic of insEncodeReg012 is correct for this case. + + code = insCodeRR(ins); + + if (TakesRexWPrefix(ins, size)) + { + code = AddRexWPrefix(ins, code); + } + + // Register... 
+ unsigned regcode = insEncodeReg012(ins, reg, size, &code); + + // Output the REX prefix + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + + dst += emitOutputWord(dst, code | (regcode << 8)); + break; + } + + case INS_seto: + case INS_setno: + case INS_setb: + case INS_setae: + case INS_sete: + case INS_setne: + case INS_setbe: + case INS_seta: + case INS_sets: + case INS_setns: + case INS_setp: + case INS_setnp: + case INS_setl: + case INS_setge: + case INS_setle: + case INS_setg: + + assert(id->idGCref() == GCT_NONE); + assert(size == EA_1BYTE); + + code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins)); + + // Output the REX prefix + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + + // We expect this to always be a 'big' opcode + assert(code & 0x00FF0000); + + dst += emitOutputByte(dst, code >> 16); + dst += emitOutputWord(dst, code & 0x0000FFFF); + + break; + + case INS_mulEAX: + case INS_imulEAX: + + // Kill off any GC refs in EAX or EDX + emitGCregDeadUpd(REG_EAX, dst); + emitGCregDeadUpd(REG_EDX, dst); + + FALLTHROUGH; + + default: + + assert(id->idGCref() == GCT_NONE); + + code = insEncodeMRreg(ins, reg, size, insCodeMR(ins)); + + if (size != EA_1BYTE) + { + // Set the 'w' bit to get the large version + code |= 0x1; + + if (size == EA_2BYTE) + { + // Output a size prefix for a 16-bit operand + dst += emitOutputByte(dst, 0x66); + } + } + + code = AddVexPrefixIfNeeded(ins, code, size); + + if (TakesRexWPrefix(ins, size)) + { + code = AddRexWPrefix(ins, code); + } + + // Output the REX prefix + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + + dst += emitOutputWord(dst, code); + break; + } + + // Are we writing the register? if so then update the GC information + switch (id->idInsFmt()) + { + case IF_RRD: + break; + case IF_RWR: + if (id->idGCref()) + { + emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); + } + else + { + emitGCregDeadUpd(id->idReg1(), dst); + } + break; + case IF_RRW: + { +#ifdef DEBUG + regMaskTP regMask = genRegMask(reg); +#endif + if (id->idGCref()) + { + // The reg must currently be holding either a gcref or a byref + // and the instruction must be inc or dec + assert(((emitThisGCrefRegs | emitThisByrefRegs) & regMask) && + (ins == INS_inc || ins == INS_dec || ins == INS_inc_l || ins == INS_dec_l)); + assert(id->idGCref() == GCT_BYREF); + // Mark it as holding a GCT_BYREF + emitGCregLiveUpd(GCT_BYREF, id->idReg1(), dst); + } + else + { + // Can't use RRW to trash a GC ref. It's OK for unverifiable code + // to trash Byrefs. + assert((emitThisGCrefRegs & regMask) == 0); + } + } + break; + default: +#ifdef DEBUG + emitDispIns(id, false, false, false); +#endif + assert(!"unexpected instruction format"); + break; + } + + return dst; +} + +/***************************************************************************** + * + * Output an instruction with two register operands. + */ + +BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) +{ + assert(false); + return 0; +} + +BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Output an instruction with a register and constant operands. + */ + +BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Output an instruction with a constant operand. 
+ */ + +BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) +{ + code_t code; + instruction ins = id->idIns(); + emitAttr size = id->idOpSize(); + ssize_t val = emitGetInsSC(id); + bool valInByte = ((signed char)val == (target_ssize_t)val); + + // We would to update GC info correctly + assert(!IsSSEInstruction(ins)); + assert(!IsAVXInstruction(ins)); + +#ifdef TARGET_AMD64 + // all these opcodes take a sign-extended 4-byte immediate, max + noway_assert(size < EA_8BYTE || ((int)val == val && !id->idIsCnsReloc())); +#endif + + if (id->idIsCnsReloc()) + { + valInByte = false; // relocs can't be placed in a byte + + // Of these instructions only the push instruction can have reloc + assert(ins == INS_push || ins == INS_push_hide); + } + + switch (ins) + { + case INS_jge: + assert((val >= -128) && (val <= 127)); + dst += emitOutputByte(dst, insCode(ins)); + dst += emitOutputByte(dst, val); + break; + + case INS_loop: + assert((val >= -128) && (val <= 127)); + dst += emitOutputByte(dst, insCodeMI(ins)); + dst += emitOutputByte(dst, val); + break; + + case INS_ret: + assert(val); + dst += emitOutputByte(dst, insCodeMI(ins)); + dst += emitOutputWord(dst, val); + break; + + case INS_push_hide: + case INS_push: + code = insCodeMI(ins); + + // Does the operand fit in a byte? + if (valInByte) + { + dst += emitOutputByte(dst, code | 2); + dst += emitOutputByte(dst, val); + } + else + { + if (TakesRexWPrefix(ins, size)) + { + code = AddRexWPrefix(ins, code); + dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + } + + dst += emitOutputByte(dst, code); + dst += emitOutputLong(dst, val); + if (id->idIsCnsReloc()) + { + emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)(size_t)val, IMAGE_REL_BASED_HIGHLOW); + } + } + + // Did we push a GC ref value? + if (id->idGCref()) + { +#ifdef DEBUG + printf("UNDONE: record GCref push [cns]\n"); +#endif + } + + break; + + default: + assert(!"unexpected instruction"); + } + + return dst; +} + +/***************************************************************************** + * + * Output a local jump instruction. + * This function also handles non-jumps that have jump-like characteristics, like RIP-relative LEA of a label that + * needs to get bound to an actual address and processed by branch shortening. + */ + +BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) +{ + assert(false); + return 0; +} + +/***************************************************************************** + * + * Append the machine code corresponding to the given instruction descriptor + * to the code block at '*dp'; the base of the code block is 'bp', and 'ig' + * is the instruction group that contains the instruction. Updates '*dp' to + * point past the generated code, and returns the size of the instruction + * descriptor in bytes. 
+ */ + +#ifdef _PREFAST_ +#pragma warning(push) +#pragma warning(disable : 21000) // Suppress PREFast warning about overly large function +#endif +size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) +{ + assert(false); + return 0; +} +#ifdef _PREFAST_ +#pragma warning(pop) +#endif + +//emitter::insFormat emitter::getMemoryOperation(instrDesc* id) +//{ +// assert(false); +//} + +#if defined(DEBUG) || defined(LATE_DISASM) + +//---------------------------------------------------------------------------------------- +// getInsExecutionCharacteristics: +// Returns the current instruction execution characteristics +// +// Arguments: +// id - The current instruction descriptor to be evaluated +// +// Return Value: +// A struct containing the current instruction execution characteristics +// +// Notes: +// The instruction latencies and throughput values returned by this function +// are for the Intel Skylake-X processor and are from either: +// 1. Agner.org - https://www.agner.org/optimize/instruction_tables.pdf +// 2. uops.info - https://uops.info/table.html +// +emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(instrDesc* id) +{ + assert(false); + insExecutionCharacteristics result; + result.insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; + return result; +} + +#endif // defined(DEBUG) || defined(LATE_DISASM) + +/*****************************************************************************/ +/*****************************************************************************/ + +#endif // defined(TARGET_XARCH) diff --git a/src/coreclr/jit/emitwasm.h b/src/coreclr/jit/emitwasm.h new file mode 100644 index 000000000000..812278701f2f --- /dev/null +++ b/src/coreclr/jit/emitwasm.h @@ -0,0 +1,573 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) + +/************************************************************************/ +/* Public inline informational methods */ +/************************************************************************/ + +public: +inline static bool isGeneralRegister(regNumber reg) +{ + return (reg <= REG_INT_LAST); +} + +inline static bool isFloatReg(regNumber reg) +{ + return (reg >= REG_FP_FIRST && reg <= REG_FP_LAST); +} + +inline static bool isDoubleReg(regNumber reg) +{ + return isFloatReg(reg); +} + +/************************************************************************/ +/* Routines that compute the size of / encode instructions */ +/************************************************************************/ + +// code_t is a type used to accumulate bits of opcode + prefixes. On amd64, it must be 64 bits +// to support the REX prefixes. On both x86 and amd64, it must be 64 bits to support AVX, with +// its 3-byte VEX prefix. 
+typedef unsigned __int64 code_t; + +struct CnsVal +{ + ssize_t cnsVal; + bool cnsReloc; +}; + +UNATIVE_OFFSET emitInsSize(code_t code); +UNATIVE_OFFSET emitInsSizeSV(code_t code, int var, int dsp); +UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp); +UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val); +UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code); +UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code, int val); +UNATIVE_OFFSET emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr); +UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code); +UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val); +UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code); +UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val); + +BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst); +BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); +BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); +BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); + +BYTE* emitOutputR(BYTE* dst, instrDesc* id); +BYTE* emitOutputRI(BYTE* dst, instrDesc* id); +BYTE* emitOutputRR(BYTE* dst, instrDesc* id); +BYTE* emitOutputIV(BYTE* dst, instrDesc* id); + +BYTE* emitOutputRRR(BYTE* dst, instrDesc* id); + +BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id); + +unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); +unsigned emitGetRexPrefixSize(instruction ins); +unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr); +unsigned emitGetPrefixSize(code_t code); +unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code); + +unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); +unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); +code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code); +unsigned insEncodeRegSIB(instruction ins, regNumber reg, code_t* code); + +code_t insEncodeMRreg(instruction ins, code_t code); +code_t insEncodeRMreg(instruction ins, code_t code); +code_t insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code); +code_t insEncodeRRIb(instruction ins, regNumber reg, emitAttr size); +code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); + +unsigned insSSval(unsigned scale); + +bool IsAVXInstruction(instruction ins); +code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); + +code_t AddRexWPrefix(instruction ins, code_t code); +code_t AddRexRPrefix(instruction ins, code_t code); +code_t AddRexXPrefix(instruction ins, code_t code); +code_t AddRexBPrefix(instruction ins, code_t code); +code_t AddRexPrefix(instruction ins, code_t code); + +bool EncodedBySSE38orSSE3A(instruction ins); +bool Is4ByteSSEInstruction(instruction ins); + +bool AreUpper32BitsZero(regNumber reg); + +bool AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, bool needsOCFlags); + +bool hasRexPrefix(code_t code) +{ +#ifdef TARGET_AMD64 + const code_t REX_PREFIX_MASK = 0xFF00000000LL; + return (code & REX_PREFIX_MASK) != 0; +#else // !TARGET_AMD64 + return false; +#endif // !TARGET_AMD64 +} + +// 3-byte VEX prefix starts with byte 0xC4 +#define VEX_PREFIX_MASK_3BYTE 0xFF000000000000ULL +#define VEX_PREFIX_CODE_3BYTE 0xC4000000000000ULL + +bool TakesVexPrefix(instruction ins); + +// Returns true if the instruction encoding 
already contains VEX prefix +bool hasVexPrefix(code_t code) +{ + return (code & VEX_PREFIX_MASK_3BYTE) == VEX_PREFIX_CODE_3BYTE; +} +code_t AddVexPrefix(instruction ins, code_t code, emitAttr attr); +code_t AddVexPrefixIfNeeded(instruction ins, code_t code, emitAttr size) +{ + if (TakesVexPrefix(ins)) + { + code = AddVexPrefix(ins, code, size); + } + return code; +} +code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) +{ + if (TakesVexPrefix(ins) && !hasVexPrefix(code)) + { + code = AddVexPrefix(ins, code, size); + } + return code; +} + +bool useVEXEncodings; +bool UseVEXEncoding() +{ + return useVEXEncodings; +} +void SetUseVEXEncoding(bool value) +{ + useVEXEncodings = value; +} + +bool containsAVXInstruction = false; +bool ContainsAVX() +{ + return containsAVXInstruction; +} +void SetContainsAVX(bool value) +{ + containsAVXInstruction = value; +} + +bool contains256bitAVXInstruction = false; +bool Contains256bitAVX() +{ + return contains256bitAVXInstruction; +} +void SetContains256bitAVX(bool value) +{ + contains256bitAVXInstruction = value; +} + +bool IsDstDstSrcAVXInstruction(instruction ins); +bool IsDstSrcSrcAVXInstruction(instruction ins); +bool IsThreeOperandAVXInstruction(instruction ins) +{ + return (IsDstDstSrcAVXInstruction(ins) || IsDstSrcSrcAVXInstruction(ins)); +} +bool isAvxBlendv(instruction ins) +{ + return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; +} +bool isSse41Blendv(instruction ins) +{ + return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb; +} +bool isPrefetch(instruction ins) +{ + return (ins == INS_prefetcht0) || (ins == INS_prefetcht1) || (ins == INS_prefetcht2) || (ins == INS_prefetchnta); +} + +/************************************************************************/ +/* Debug-only routines to display instructions */ +/************************************************************************/ + +#ifdef DEBUG + +const char* emitFPregName(unsigned reg, bool varName = true); + +void emitDispReloc(ssize_t value); +void emitDispAddrMode(instrDesc* id, bool noDetail = false); +void emitDispShift(instruction ins, int cnt = 0); + +void emitDispIns(instrDesc* id, + bool isNew, + bool doffs, + bool asmfm, + unsigned offs = 0, + BYTE* code = nullptr, + size_t sz = 0, + insGroup* ig = nullptr); + +const char* emitXMMregName(unsigned reg); +const char* emitYMMregName(unsigned reg); + +#endif + +/************************************************************************/ +/* Private members that deal with target-dependent instr. 
descriptors */ +/************************************************************************/ + +private: +void emitSetAmdDisp(instrDescAmd* id, ssize_t dsp); +instrDesc* emitNewInstrAmd(emitAttr attr, ssize_t dsp); +instrDesc* emitNewInstrAmdCns(emitAttr attr, ssize_t dsp, int cns); + +instrDesc* emitNewInstrCallDir(int argCnt, + VARSET_VALARG_TP GCvars, + regMaskTP gcrefRegs, + regMaskTP byrefRegs, + emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize)); + +instrDesc* emitNewInstrCallInd(int argCnt, + ssize_t disp, + VARSET_VALARG_TP GCvars, + regMaskTP gcrefRegs, + regMaskTP byrefRegs, + emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize)); + +void emitGetInsCns(instrDesc* id, CnsVal* cv); +ssize_t emitGetInsAmdCns(instrDesc* id, CnsVal* cv); +void emitGetInsDcmCns(instrDesc* id, CnsVal* cv); +ssize_t emitGetInsAmdAny(instrDesc* id); + +/************************************************************************/ +/* Private helpers for instruction output */ +/************************************************************************/ + +private: +insFormat emitInsModeFormat(instruction ins, insFormat base, insFormat FPld, insFormat FPst); + +bool emitVerifyEncodable(instruction ins, emitAttr size, regNumber reg1, regNumber reg2 = REG_NA); + +bool emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id); + +#if FEATURE_FIXED_OUT_ARGS +void emitAdjustStackDepthPushPop(instruction ins) +{ +} +void emitAdjustStackDepth(instruction ins, ssize_t val) +{ +} +#else // !FEATURE_FIXED_OUT_ARGS +void emitAdjustStackDepthPushPop(instruction ins); +void emitAdjustStackDepth(instruction ins, ssize_t val); +#endif // !FEATURE_FIXED_OUT_ARGS + +/***************************************************************************** +* +* Convert between an index scale in bytes to a smaller encoding used for +* storage in instruction descriptors. 
+*/ + +inline emitter::opSize emitEncodeScale(size_t scale) +{ + assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); + + return emitSizeEncode[scale - 1]; +} + +inline emitAttr emitDecodeScale(unsigned ensz) +{ + assert(ensz < 4); + + return emitter::emitSizeDecode[ensz]; +} + +/************************************************************************/ +/* The public entry points to output instructions */ +/************************************************************************/ + +public: +void emitLoopAlign(unsigned short paddingBytes = 15); + +void emitLongLoopAlign(unsigned short alignmentBoundary); + +void emitIns(instruction ins); + +void emitIns(instruction ins, emitAttr attr); + +void emitInsRMW(instruction inst, emitAttr attr, GenTreeStoreInd* storeInd, GenTree* src); + +void emitInsRMW(instruction inst, emitAttr attr, GenTreeStoreInd* storeInd); + +void emitIns_Nop(unsigned size); + +void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val); + +void emitIns_R(instruction ins, emitAttr attr, regNumber reg); + +void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs); + +void emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, ssize_t val); + +void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2); + +void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival); + +void emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs); + +void emitIns_AR_R_R(instruction ins, emitAttr attr, regNumber op2Reg, regNumber op3Reg, regNumber base, int offs); + +void emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir); + +void emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival); + +void emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival); + +void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); + +void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival); + +void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir); + +void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); + +void emitIns_R_AR_R(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + regNumber base, + regNumber index, + int scale, + int offs); + +void emitIns_R_R_C( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs); + +void emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs); + +void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); + +void emitIns_R_R_A_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt); +void emitIns_R_R_AR_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival); +void emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs, regNumber reg, int ival); + +void emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, regNumber reg, int imm); + +void emitIns_R_R_C_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); + +void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, int 
ival); + +void emitIns_R_R_S_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival); + +void emitIns_R_R_A_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir); + +void emitIns_R_R_AR_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base, int offs); + +void emitIns_R_R_C_R(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + regNumber op3Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs); + +void emitIns_R_R_S_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, int varx, int offs); + +void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4); + +void emitIns_S(instruction ins, emitAttr attr, int varx, int offs); + +void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs); + +void emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs); + +void emitIns_S_I(instruction ins, emitAttr attr, int varx, int offs, int val); + +void emitIns_R_C(instruction ins, emitAttr attr, regNumber reg, CORINFO_FIELD_HANDLE fldHnd, int offs); + +void emitIns_C_R(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, regNumber reg, int offs); + +void emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs, int val); + +void emitIns_IJ(emitAttr attr, regNumber reg, unsigned base); + +void emitIns_J_S(instruction ins, emitAttr attr, BasicBlock* dst, int varx, int offs); + +void emitIns_R_L(instruction ins, emitAttr attr, BasicBlock* dst, regNumber reg); + +void emitIns_R_D(instruction ins, emitAttr attr, unsigned offs, regNumber reg); + +void emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int offs); + +void emitIns_I_AI(instruction ins, emitAttr attr, int val, ssize_t disp); + +void emitIns_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber base, int disp); + +void emitIns_R_AI(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp); + +void emitIns_AR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp); + +void emitIns_AI_R(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp); + +void emitIns_I_ARR(instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, int disp); + +void emitIns_R_ARR(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp); + +void emitIns_ARR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp); + +void emitIns_I_ARX(instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, unsigned mul, int disp); + +void emitIns_R_ARX( + instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp); + +void emitIns_ARX_R(instruction ins, + emitAttr attr, + regNumber reg, + regNumber base, + regNumber index, + unsigned scale, + cnsval_ssize_t disp); + +void emitIns_I_AX(instruction ins, emitAttr attr, int val, regNumber reg, unsigned mul, int disp); + +void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); + +void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); + +void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); + +void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, 
regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); +void emitIns_SIMD_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset); +void emitIns_SIMD_R_R_C( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs); +void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); +void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs); + +#ifdef FEATURE_HW_INTRINSICS +void emitIns_SIMD_R_R_A_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); +void emitIns_SIMD_R_R_AR_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int ival); +void emitIns_SIMD_R_R_C_I(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + int ival); +void emitIns_SIMD_R_R_R_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int ival); +void emitIns_SIMD_R_R_S_I( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, int ival); + +void emitIns_SIMD_R_R_R_A( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir); +void emitIns_SIMD_R_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber base); +void emitIns_SIMD_R_R_R_C(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + regNumber op2Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs); +void emitIns_SIMD_R_R_R_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg); +void emitIns_SIMD_R_R_R_S( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs); + +void emitIns_SIMD_R_R_A_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir); +void emitIns_SIMD_R_R_AR_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber base); +void emitIns_SIMD_R_R_C_R(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + regNumber op2Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs); +void emitIns_SIMD_R_R_S_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs); +#endif // FEATURE_HW_INTRINSICS + +enum EmitCallType +{ + EC_FUNC_TOKEN, // Direct call to a helper/static/nonvirtual/global method + EC_FUNC_TOKEN_INDIR, // Indirect call to a helper/static/nonvirtual/global method + EC_FUNC_ADDR, // Direct call to an absolute address + + EC_FUNC_VIRTUAL, // Call to a virtual method (using the vtable) + EC_INDIR_R, // Indirect call via register + EC_INDIR_SR, // Indirect call via stack-reference (local var) + EC_INDIR_C, // Indirect call via static class var + EC_INDIR_ARD, // Indirect call via an addressing mode + + EC_COUNT +}; + +// clang-format off +void emitIns_Call(EmitCallType callType, + CORINFO_METHOD_HANDLE methHnd, + INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) // used to report call sites to the EE + void* addr, + ssize_t argSize, + emitAttr retSize + MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize), + VARSET_VALARG_TP ptrVars, + regMaskTP gcrefRegs, + regMaskTP byrefRegs, + IL_OFFSETX 
ilOffset = BAD_IL_OFFSET,
+ regNumber ireg = REG_NA,
+ regNumber xreg = REG_NA,
+ unsigned xmul = 0,
+ ssize_t disp = 0,
+ bool isJump = false);
+// clang-format on
+
+#ifdef TARGET_AMD64
+// Is the last instruction emitted a call instruction?
+bool emitIsLastInsCall();
+
+// Insert a NOP at the end of the current instruction group if the last emitted instruction was a 'call',
+// because the next instruction group will be an epilog.
+void emitOutputPreEpilogNOP();
+#endif // TARGET_AMD64
+
+/*****************************************************************************
+ *
+ *  Given a jump, return true if it's a conditional jump.
+ */
+
+inline bool emitIsCondJump(instrDesc* jmp)
+{
+    instruction ins = jmp->idIns();
+
+    assert(jmp->idInsFmt() == IF_LABEL);
+
+    return (ins != INS_call && ins != INS_jmp);
+}
+
+/*****************************************************************************
+ *
+ *  Given a jump, return true if it's an unconditional jump.
+ */
+
+inline bool emitIsUncondJump(instrDesc* jmp)
+{
+    instruction ins = jmp->idIns();
+
+    assert(jmp->idInsFmt() == IF_LABEL);
+
+    return (ins == INS_jmp);
+}
+
+#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64)
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index ec9c091b5d45..aaa07b087400 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -15,7 +15,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #pragma hdrstop
 #endif
-#if defined(TARGET_XARCH)
+#if defined(TARGET_XARCH)
 /*****************************************************************************/
 /*****************************************************************************/
diff --git a/src/coreclr/jit/error.h b/src/coreclr/jit/error.h
index a63643c0ee5b..7856b4a974b3 100644
--- a/src/coreclr/jit/error.h
+++ b/src/coreclr/jit/error.h
@@ -174,6 +174,8 @@ extern void notYetImplemented(const char* msg, const char* file, unsigned line);
 #define NYI_X86(msg) do { } while (0)
 #define NYI_ARM(msg) do { } while (0)
 #define NYI_ARM64(msg) do { } while (0)
+#define NYI_WASM32(msg) do { } while (0)
+#define NYI_WASM64(msg) do { } while (0)
@@ -181,6 +183,8 @@ extern void notYetImplemented(const char* msg, const char* file, unsigned line);
 #define NYI_X86(msg) NYIRAW("NYI_X86: " msg)
 #define NYI_ARM(msg) do { } while (0)
 #define NYI_ARM64(msg) do { } while (0)
+#define NYI_WASM32(msg) do { } while (0)
+#define NYI_WASM64(msg) do { } while (0)
@@ -188,6 +192,8 @@ extern void notYetImplemented(const char* msg, const char* file, unsigned line);
 #define NYI_X86(msg) do { } while (0)
 #define NYI_ARM(msg) NYIRAW("NYI_ARM: " msg)
 #define NYI_ARM64(msg) do { } while (0)
+#define NYI_WASM32(msg) do { } while (0)
+#define NYI_WASM64(msg) do { } while (0)
@@ -195,6 +201,26 @@ extern void notYetImplemented(const char* msg, const char* file, unsigned line);
 #define NYI_X86(msg) do { } while (0)
 #define NYI_ARM(msg) do { } while (0)
 #define NYI_ARM64(msg) NYIRAW("NYI_ARM64: " msg)
+#define NYI_WASM32(msg) do { } while (0)
+#define NYI_WASM64(msg) do { } while (0)
+
+#elif defined(TARGET_WASM64)
+
+#define NYI_AMD64(msg) do { } while (0)
+#define NYI_X86(msg) do { } while (0)
+#define NYI_ARM(msg) do { } while (0)
+#define NYI_ARM64(msg) do { } while (0)
+#define NYI_WASM32(msg) do { } while (0)
+#define NYI_WASM64(msg) NYIRAW("NYI_WASM64: " msg)
+
+#elif defined(TARGET_WASM32)
+
+#define NYI_AMD64(msg) do { } while (0)
+#define NYI_X86(msg) do { } while (0)
+#define NYI_ARM(msg) do { } while (0) +#define NYI_ARM64(msg) do { } while (0) +#define NYI_WASM32(msg) NYIRAW("NYI_WASM32: " msg) +#define NYI_WASM64(msg) do { } while (0) #else diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index d65a21f7784a..18ced83e4da2 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2894,7 +2894,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ // nodes with GTF_ADDRMODE_NO_CSE and calculate a more accurate cost. addr->gtFlags |= GTF_ADDRMODE_NO_CSE; -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // addrmodeCount is the count of items that we used to form // an addressing mode. The maximum value is 4 when we have // all of these: { base, idx, cns, mul } @@ -3025,7 +3025,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ } } } -#elif defined TARGET_ARM64 +#elif defined TARGET_ARM64 if (base) { *pCostEx += base->GetCostEx(); @@ -3331,7 +3331,7 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) goto COMMON_CNS; } -#elif defined TARGET_XARCH +#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm case GT_CNS_STR: #ifdef TARGET_AMD64 @@ -3660,6 +3660,16 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costEx = 1; costSz = 2; + if (isflt || varTypeIsFloating(op1->TypeGet())) + { + /* cast involving floats always go through memory */ + costEx = IND_COST_EX * 2; + costSz = 6; + } +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + costEx = 1; + costSz = 2; + if (isflt || varTypeIsFloating(op1->TypeGet())) { /* cast involving floats always go through memory */ diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 165c98ce2471..d3feb36c64f6 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -7363,7 +7363,7 @@ inline unsigned GenTree::GetMultiRegCount() } #endif -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) if (OperIsMultiRegOp()) { return AsMultiRegOp()->GetRegCount(); @@ -7432,7 +7432,7 @@ inline regNumber GenTree::GetRegByIndex(int regIndex) return AsPutArgSplit()->GetRegNumByIdx(regIndex); } #endif -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) if (OperIsMultiRegOp()) { return AsMultiRegOp()->GetRegNumByIdx(regIndex); @@ -7492,7 +7492,7 @@ inline var_types GenTree::GetRegTypeByIndex(int regIndex) return AsPutArgSplit()->GetRegType(regIndex); } #endif -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) if (OperIsMultiRegOp()) { return AsMultiRegOp()->GetRegType(regIndex); @@ -7556,7 +7556,7 @@ inline unsigned int GenTree::GetRegSpillFlagByIdx(int regIndex) const return AsPutArgSplit()->GetRegSpillFlagByIdx(regIndex); } #endif -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) if (OperIsMultiRegOp()) { return AsMultiRegOp()->GetRegSpillFlagByIdx(regIndex); diff --git a/src/coreclr/jit/gtstructs.h b/src/coreclr/jit/gtstructs.h index b4bad947fd90..2947e36d5f76 100644 --- a/src/coreclr/jit/gtstructs.h +++ b/src/coreclr/jit/gtstructs.h @@ -122,6 +122,8 @@ GTSTRUCT_2(CC , GT_JCC, GT_SETCC) GTSTRUCT_1(MultiRegOp , GT_MUL_LONG) #elif defined (TARGET_ARM) GTSTRUCT_3(MultiRegOp , GT_MUL_LONG, GT_PUTARG_REG, GT_BITCAST) +#elif defined (TARGET_WASM32) 
+GTSTRUCT_3(MultiRegOp, GT_MUL_LONG, GT_PUTARG_REG, GT_BITCAST) #endif /*****************************************************************************/ #undef GTSTRUCT_0 diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index 180c991148ab..eec01b573e51 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -3783,7 +3783,7 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, GenTree* op1; GenTree* op2; -#if defined(TARGET_XARCH) || defined(TARGET_ARM64) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // TODO-ARM-CQ: reenable treating Interlocked operation as intrinsic // Note that CORINFO_INTRINSIC_InterlockedAdd32/64 are not actually used. diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 06ef501a5220..5d187615ad28 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -65,6 +65,15 @@ const char* CodeGen::genInsName(instruction ins) #define INST9(id, nm, ldst, fmt, e1, e2, e3, e4, e5, e6, e7, e8, e9 ) nm, #include "instrs.h" +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + #define INST0(id, nm, um, mr, flags) nm, + #define INST1(id, nm, um, mr, flags) nm, + #define INST2(id, nm, um, mr, mi, flags) nm, + #define INST3(id, nm, um, mr, mi, rm, flags) nm, + #define INST4(id, nm, um, mr, mi, rm, a4, flags) nm, + #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) nm, + #include "instrs.h" + #else #error "Unknown TARGET" #endif @@ -1810,6 +1819,20 @@ instruction CodeGen::ins_Copy(var_types dstType) { return INS_mov; } +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + if (varTypeIsSIMD(dstType)) + { + return INS_movaps; + } + else if (varTypeIsFloating(dstType)) + { + // Both float and double copy can use movaps + return INS_movaps; + } + else + { + return INS_mov; + } #else // TARGET_* #error "Unknown TARGET_" #endif @@ -1835,7 +1858,7 @@ instruction CodeGen::ins_Copy(regNumber srcReg, var_types dstType) { return ins_Copy(dstType); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm if (dstIsFloatReg) { return INS_mov_i2xmm; @@ -1990,7 +2013,7 @@ instruction CodeGenInterface::ins_StoreFromSrc(regNumber srcReg, var_types dstTy return ins; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) bool CodeGen::isMoveIns(instruction ins) { @@ -2374,7 +2397,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) } #endif // DEBUG -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // only full barrier needs to be emitted on Xarch if (barrierKind != BARRIER_FULL) { @@ -2403,6 +2426,8 @@ void CodeGen::instGen_Set_Reg_To_Zero(emitAttr size, regNumber reg, insFlags fla GetEmitter()->emitIns_R_R(INS_xor, size, reg, reg); #elif defined(TARGET_ARMARCH) GetEmitter()->emitIns_R_I(INS_mov, size, reg, 0 ARM_ARG(flags)); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + GetEmitter()->emitIns_R_R(INS_xor, size, reg, reg); #else #error "Unknown TARGET" #endif @@ -2420,6 +2445,8 @@ void CodeGen::instGen_Compare_Reg_To_Zero(emitAttr size, regNumber reg) GetEmitter()->emitIns_R_R(INS_test, size, reg, reg); #elif defined(TARGET_ARMARCH) GetEmitter()->emitIns_R_I(INS_cmp, size, reg, 0); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + GetEmitter()->emitIns_R_R(INS_test, size, reg, reg); #else #error "Unknown TARGET" #endif @@ -2432,7 
+2459,7 @@ void CodeGen::instGen_Compare_Reg_To_Zero(emitAttr size, regNumber reg) */ void CodeGen::instGen_Compare_Reg_To_Reg(emitAttr size, regNumber reg1, regNumber reg2) { -#if defined(TARGET_XARCH) || defined(TARGET_ARMARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARMARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm GetEmitter()->emitIns_R_R(INS_cmp, size, reg1, reg2); #else #error "Unknown TARGET" @@ -2452,7 +2479,7 @@ void CodeGen::instGen_Compare_Reg_To_Imm(emitAttr size, regNumber reg, target_ss } else { -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm #if defined(TARGET_AMD64) if ((EA_SIZE(size) == EA_8BYTE) && (((int)imm != (ssize_t)imm) || EA_IS_CNS_RELOC(size))) { diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index ed001fdc1bc7..dac28eef3436 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -47,6 +47,14 @@ enum instruction : unsigned INS_lea, // Not a real instruction. It is used for load the address of stack locals +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#define INST0(id, nm, um, mr, flags) INS_##id, +#define INST1(id, nm, um, mr, flags) INS_##id, +#define INST2(id, nm, um, mr, mi, flags) INS_##id, +#define INST3(id, nm, um, mr, mi, rm, flags) INS_##id, +#define INST4(id, nm, um, mr, mi, rm, a4, flags) INS_##id, +#define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) INS_##id, +#include "instrs.h" #else #error Unsupported target architecture #endif @@ -106,6 +114,19 @@ enum insFlags: unsigned INS_FLAGS_SET = 0x01, INS_FLAGS_DONT_CARE = 0x02, }; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO : can this be removed/empty? +enum insFlags: uint8_t +{ + INS_FLAGS_None = 0x00, + INS_FLAGS_ReadsFlags = 0x01, + INS_FLAGS_WritesFlags = 0x02, + INS_FLAGS_x87Instr = 0x04, + INS_Flags_IsDstDstSrcAVXInstruction = 0x08, + INS_Flags_IsDstSrcSrcAVXInstruction = 0x10, + + // TODO-Cleanup: + INS_FLAGS_DONT_CARE = 0x00, +}; #else #error Unsupported target architecture #endif diff --git a/src/coreclr/jit/instrs.h b/src/coreclr/jit/instrs.h index b543f781645f..790984e94f85 100644 --- a/src/coreclr/jit/instrs.h +++ b/src/coreclr/jit/instrs.h @@ -7,6 +7,8 @@ #include "instrsarm.h" #elif defined(TARGET_ARM64) #include "instrsarm64.h" +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#include "instrswasm.h" #else #error Unsupported or unset target architecture #endif // target type diff --git a/src/coreclr/jit/instrswasm.h b/src/coreclr/jit/instrswasm.h new file mode 100644 index 000000000000..a335394277f1 --- /dev/null +++ b/src/coreclr/jit/instrswasm.h @@ -0,0 +1,774 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+
+//
+// This file is derived from the x86 instruction table (previously known as instrs.h).
+//
+/*****************************************************************************
+ * x86 instruction encodings for the JIT compiler, reused here as placeholders for the WASM targets
+ *
+ * id -- the enum name for the instruction
+ * nm -- textual name (for assembly display)
+ * um -- update mode, see IUM_xx enum (rd, wr, or rw)
+ * mr -- base encoding for R/M[reg] addressing mode
+ * mi -- base encoding for R/M,icon addressing mode
+ * rm -- base encoding for reg,R/M addressing mode
+ * a4 -- base encoding for eax,i32 addressing mode
+ * rr -- base encoding for register addressing mode
+ * flags -- flags, see INS_FLAGS_* enum
+ *
+******************************************************************************/
+
+// clang-format off
+#if !defined(TARGET_WASM32) && !defined(TARGET_WASM64)
+  #error Unexpected target type
+#endif
+
+#ifndef INST1
+#error At least INST1 must be defined before including this file.
+#endif
+/*****************************************************************************/
+#ifndef INST0
+#define INST0(id, nm, um, mr, flags)
+#endif
+#ifndef INST2
+#define INST2(id, nm, um, mr, mi, flags)
+#endif
+#ifndef INST3
+#define INST3(id, nm, um, mr, mi, rm, flags)
+#endif
+#ifndef INST4
+#define INST4(id, nm, um, mr, mi, rm, a4, flags)
+#endif
+#ifndef INST5
+#define INST5(id, nm, um, mr, mi, rm, a4, rr, flags)
+#endif
+
+/*****************************************************************************/
+/* The following is x86-specific */
+/*****************************************************************************/
+
+// id nm um mr mi rm a4 rr flags
+INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None)
+
+INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_FLAGS_None)
+INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_FLAGS_None)
+// Does not affect the stack tracking in the emitter
+INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_FLAGS_None)
+INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_FLAGS_None)
+
+INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_FLAGS_WritesFlags)
+INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_FLAGS_WritesFlags)
+INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_FLAGS_WritesFlags)
+INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_FLAGS_WritesFlags)
+
+// Multi-byte opcodes without modrm are represented in mixed endian fashion.
+// See comment around quarter way through this file for more information.
+INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None) + +// id nm um mr mi rm a4 flags +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_FLAGS_WritesFlags) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_FLAGS_WritesFlags) +INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_FLAGS_WritesFlags) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_FLAGS_WritesFlags) +INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_FLAGS_WritesFlags) +INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_FLAGS_WritesFlags) +INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_FLAGS_WritesFlags) +INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_FLAGS_None) + +INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_FLAGS_None) + +// id nm um mr mi rm flags + +// Note that emitter has only partial support for BT. It can only emit the reg,reg form +// and the registers need to be reversed to get the correct encoding. +INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_FLAGS_WritesFlags) + +INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_FLAGS_WritesFlags) +INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_FLAGS_WritesFlags) + +INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_FLAGS_None) +#ifdef TARGET_AMD64 +INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_FLAGS_None) +#endif +INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_FLAGS_None) + +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_FLAGS_ReadsFlags) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_FLAGS_ReadsFlags) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_FLAGS_ReadsFlags) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_FLAGS_ReadsFlags) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_FLAGS_ReadsFlags) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_FLAGS_ReadsFlags) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_FLAGS_ReadsFlags) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_FLAGS_ReadsFlags) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_FLAGS_ReadsFlags) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_FLAGS_ReadsFlags) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_FLAGS_ReadsFlags) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_FLAGS_ReadsFlags) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_FLAGS_ReadsFlags) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_FLAGS_ReadsFlags) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_FLAGS_ReadsFlags) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_FLAGS_ReadsFlags) + +INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_FLAGS_None) +INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_FLAGS_WritesFlags) // op1 *= op2 + +// id nm um mr mi rm flags + +// Instead of encoding these as 3-operand instructions, we encode them +// as 2-operand instructions with the target register being implicit +// 
implicit_reg = op1*op2_icon +#define INSTMUL INST3 +INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_FLAGS_WritesFlags) + +#ifdef TARGET_AMD64 + +INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_FLAGS_WritesFlags) +INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_FLAGS_WritesFlags) + +#endif // TARGET_AMD64 + +// the hex codes in this file represent the instruction encoding as follows: +// 0x0000ff00 - modrm byte position +// 0x000000ff - last byte of opcode (before modrm) +// 0x00ff0000 - first byte of opcode +// 0xff000000 - middle byte of opcode, if needed (after first, before last) +// +// So a 1-byte opcode is: and with modrm: +// 0x00000011 0x0000RM11 +// +// So a 2-byte opcode is: and with modrm: +// 0x00002211 0x0011RM22 +// +// So a 3-byte opcode is: and with modrm: +// 0x00113322 0x2211RM33 +// +// So a 4-byte opcode would be something like this: +// 0x22114433 + +#define PACK3(byte1,byte2,byte3) (((byte1) << 16) | ((byte2) << 24) | (byte3)) +#define PACK2(byte1,byte2) (((byte1) << 16) | (byte2)) +#define SSEFLT(c) PACK3(0xf3, 0x0f, c) +#define SSEDBL(c) PACK3(0xf2, 0x0f, c) +#define PCKDBL(c) PACK3(0x66, 0x0f, c) +#define PCKFLT(c) PACK2(0x0f,c) + +// These macros encode extra byte that is implicit in the macro. +#define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8)) +#define SSE38(c) PACK4(0x66, 0x0f, 0x38, c) +#define SSE3A(c) PACK4(0x66, 0x0f, 0x3A, c) + +// VEX* encodes the implied leading opcode bytes in c1: +// 1: implied 0f, 2: implied 0f 38, 3: implied 0f 3a +#define VEX2INT(c1,c2) PACK3(c1, 0xc5, c2) +#define VEX3INT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) +#define VEX3FLT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) + +INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +// These are the SSE instructions used on x86 +INST3(mov_i2xmm, "movd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6E), INS_FLAGS_None) // Move int reg to a xmm reg. reg1=xmm reg, reg2=int reg +INST3(mov_xmm2i, "movd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7E), INS_FLAGS_None) // Move xmm reg to an int reg. 
reg1=xmm reg, reg2=int reg +INST3(pmovmskb, "pmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), INS_FLAGS_None) // Move the MSB bits of all bytes in a xmm reg to an int reg +INST3(movmskpd, "movmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_FLAGS_None) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. +INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_FLAGS_None) +INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_FLAGS_None) +INST3(movsdsse2, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_Flags_IsDstSrcSrcAVXInstruction) + +INST3(punpckldq, "punpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), INS_Flags_IsDstDstSrcAVXInstruction) + +INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles + +INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_FLAGS_None) // cvt with trunc scalar double to signed DWORDs + +INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_FLAGS_None) +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_FLAGS_None) +INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movhps, "movhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movss, "movss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), INS_FLAGS_None) +INST3(movaps, "movaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), INS_FLAGS_None) +INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_FLAGS_None) +INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), INS_FLAGS_None) +INST3(movhlps, "movhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), INS_Flags_IsDstDstSrcAVXInstruction) +INST3(movlhps, "movlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), INS_Flags_IsDstDstSrcAVXInstruction) +INST3(movmskps, "movmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), INS_FLAGS_None) +INST3(unpckhps, "unpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), INS_Flags_IsDstDstSrcAVXInstruction) +INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), INS_Flags_IsDstDstSrcAVXInstruction) +INST3(maskmovdqu, "maskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), INS_FLAGS_None) + +INST3(shufps, "shufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), INS_Flags_IsDstDstSrcAVXInstruction) +INST3(shufpd, "shufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), INS_Flags_IsDstDstSrcAVXInstruction) + +INST3(punpckhdq, "punpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), INS_Flags_IsDstDstSrcAVXInstruction) + +INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(prefetchnta, "prefetchnta", 
IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + +// SSE 2 arith +INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles +INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles +INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles +INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles +INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles +INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single +INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles +INST3(mulsd, "mulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles +INST3(subps, "subps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles +INST3(subss, "subss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar singles +INST3(subpd, "subpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles +INST3(subsd, "subsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar doubles +INST3(minps, "minps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles +INST3(minss, "minss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single +INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed doubles +INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double +INST3(divps, "divps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles +INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles +INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles +INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles +INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed singles +INST3(maxss, "maxss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar single +INST3(maxpd, "maxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles +INST3(maxsd, "maxsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar double +INST3(xorpd, 
"xorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles +INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles +INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles +INST3(sqrtps, "sqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), INS_FLAGS_None) // Sqrt of packed singles +INST3(sqrtss, "sqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single +INST3(sqrtpd, "sqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), INS_FLAGS_None) // Sqrt of packed doubles +INST3(sqrtsd, "sqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double +INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles +INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles +INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles +INST3(orpd, "orpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles +INST3(haddpd, "haddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles +INST3(haddps, "haddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats +INST3(hsubpd, "hsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles +INST3(hsubps, "hsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats +INST3(addsubps, "addsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles +INST3(addsubpd, "addsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles + +// SSE 2 approx arith +INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_FLAGS_None) // Reciprocal of packed singles +INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single +INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_FLAGS_None) // Reciprocal Sqrt of packed singles +INST3(rsqrtss, "rsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single + +// SSE2 conversions +INST3(cvtpi2ps, "cvtpi2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2A), INS_FLAGS_None) // cvt packed DWORDs to singles +INST3(cvtsi2ss, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single +INST3(cvtpi2pd, "cvtpi2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2A), INS_FLAGS_None) // cvt packed DWORDs to doubles +INST3(cvtsi2sd, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar double +INST3(cvttps2pi, "cvttps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2C), INS_FLAGS_None) // cvt with trunc packed singles to DWORDs +INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_FLAGS_None) // cvt with trunc scalar single to DWORD +INST3(cvttpd2pi, "cvttpd2pi", IUM_WR, BAD_CODE, 
BAD_CODE, PCKDBL(0x2C), INS_FLAGS_None) // cvt with trunc packed doubles to DWORDs +INST3(cvtps2pi, "cvtps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2D), INS_FLAGS_None) // cvt packed singles to DWORDs +INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_FLAGS_None) // cvt scalar single to DWORD +INST3(cvtpd2pi, "cvtpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2D), INS_FLAGS_None) // cvt packed doubles to DWORDs +INST3(cvtsd2si, "cvtsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2D), INS_FLAGS_None) // cvt scalar double to DWORD +INST3(cvtps2pd, "cvtps2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5A), INS_FLAGS_None) // cvt packed singles to doubles +INST3(cvtpd2ps, "cvtpd2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5A), INS_FLAGS_None) // cvt packed doubles to singles +INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles +INST3(cvtsd2ss, "cvtsd2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar double to scalar singles +INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_FLAGS_None) // cvt packed DWORDs to singles +INST3(cvtps2dq, "cvtps2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5B), INS_FLAGS_None) // cvt packed singles to DWORDs +INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_FLAGS_None) // cvt with trunc packed singles to DWORDs +INST3(cvtpd2dq, "cvtpd2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xE6), INS_FLAGS_None) // cvt packed doubles to DWORDs +INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_FLAGS_None) // cvt with trunc packed doubles to DWORDs +INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_FLAGS_None) // cvt packed DWORDs to doubles + +// SSE2 comparison instructions +INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), INS_FLAGS_None) // ordered compare singles +INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), INS_FLAGS_None) // ordered compare doubles +INST3(ucomiss, "ucomiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2E), INS_FLAGS_None) // unordered compare singles +INST3(ucomisd, "ucomisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2E), INS_FLAGS_None) // unordered compare doubles + +// SSE2 packed single/double comparison operations. +// Note that these instructions not only compare but also overwrite the first source. 
+INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles +INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles +INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles +INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles + +//SSE2 packed integer operations +INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers +INST3(paddw, "paddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers +INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers +INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers +INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results +INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results +INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results +INST3(paddusw, "paddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results +INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers +INST3(pavgw, "pavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers +INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers +INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers +INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers +INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers +INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst +INST3(pmulhw, "pmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers +INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers +INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result +INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result +INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs +INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs +INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs +INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers +INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation +INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation +INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation +INST3(psubusw, "psubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation + +// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode, +// which is handled in emitxarch.cpp. 
+INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes +INST3(pslldq, "pslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes +INST3(psllw, "psllw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 16-bit integers +INST3(pslld, "pslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers +INST3(psllq, "psllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers +INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers +INST3(psrld, "psrld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 32-bit integers +INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers +INST3(psraw, "psraw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 16-bit integers +INST3(psrad, "psrad", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 32-bit integers + +INST3(pmaxub, "pmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes +INST3(pminub, "pminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes +INST3(pmaxsw, "pmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words +INST3(pminsw, "pminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words +INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality +INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than +INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality +INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than +INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality +INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than + +INST3(pshufd, "pshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), INS_FLAGS_None) // Packed shuffle of 32-bit integers +INST3(pshufhw, "pshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), INS_FLAGS_None) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. +INST3(pshuflw, "pshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), INS_FLAGS_None) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. 
+INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_FLAGS_None) // Extract 16-bit value into a r32 with zero extended to 32-bits +INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index + +INST3(punpckhbw, "punpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) +INST3(punpcklbw, "punpcklbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x60), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (lo) +INST3(punpckhqdq, "punpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi) +INST3(punpcklqdq, "punpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo) +INST3(punpckhwd, "punpckhwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x69), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (hi) +INST3(punpcklwd, "punpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo) +INST3(unpckhpd, "unpckhpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x15), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) +INST3(unpcklpd, "unpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) + +INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation +INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation +INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation + +// id nm um mr mi rm flags +INST3(dpps, "dpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs +INST3(dppd, "dppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs +INST3(insertps, "insertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value +INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality +INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality +INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result +INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_FLAGS_None) // Packed logical compare +INST3(phaddd, "phaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add +INST3(pabsb, "pabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), INS_FLAGS_None) // Packed absolute value of bytes +INST3(pabsw, "pabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), INS_FLAGS_None) // Packed absolute value of 16-bit integers +INST3(pabsd, "pabsd", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0x1E), INS_FLAGS_None) // Packed absolute value of 32-bit integers +INST3(palignr, "palignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right +INST3(pmaddubsw, "pmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes +INST3(pmulhrsw, "pmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale +INST3(pshufb, "pshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes +INST3(psignb, "psignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(psignw, "psignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(psignd, "psignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(pminsb, "pminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes +INST3(pminsd, "pminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers +INST3(pminuw, "pminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers +INST3(pminud, "pminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers +INST3(pmaxsb, "pmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes +INST3(pmaxsd, "pmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers +INST3(pmaxuw, "pmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers +INST3(pmaxud, "pmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers +INST3(pmovsxbw, "pmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), INS_FLAGS_None) // Packed sign extend byte to short +INST3(pmovsxbd, "pmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), INS_FLAGS_None) // Packed sign extend byte to int +INST3(pmovsxbq, "pmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), INS_FLAGS_None) // Packed sign extend byte to long +INST3(pmovsxwd, "pmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), INS_FLAGS_None) // Packed sign extend short to int +INST3(pmovsxwq, "pmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), INS_FLAGS_None) // Packed sign extend short to long +INST3(pmovsxdq, "pmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), INS_FLAGS_None) // Packed sign extend int to long +INST3(pmovzxbw, "pmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), INS_FLAGS_None) // Packed zero extend byte to short +INST3(pmovzxbd, "pmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), INS_FLAGS_None) // Packed zero extend byte to intg +INST3(pmovzxbq, "pmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), INS_FLAGS_None) // Packed zero extend byte to lon +INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), INS_FLAGS_None) // Packed zero extend short to int +INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_FLAGS_None) // Packed zero extend short to long +INST3(pmovzxdq, "pmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), 
INS_FLAGS_None) // Packed zero extend int to long +INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation +INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_FLAGS_None) // Round packed single precision floating-point values +INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values +INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_FLAGS_None) // Round packed double precision floating-point values +INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values +INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result +INST3(blendps, "blendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values +INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), INS_FLAGS_None) // Variable Blend Packed Singles +INST3(blendpd, "blendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values +INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), INS_FLAGS_None) // Variable Blend Packed Doubles +INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words +INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_FLAGS_None) // Variable Blend Packed Bytes +INST3(phaddw, "phaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers +INST3(phsubw, "phsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers +INST3(phsubd, "phsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers +INST3(phaddsw, "phaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation +INST3(phsubsw, "phsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation +INST3(lddqu, "lddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), INS_FLAGS_None) // Load Unaligned integer +INST3(movntdqa, "movntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), INS_FLAGS_None) // Load Double Quadword Non-Temporal Aligned Hint +INST3(movddup, "movddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), INS_FLAGS_None) // Replicate Double FP Values +INST3(movsldup, "movsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), INS_FLAGS_None) // Replicate even-indexed Single FP Values +INST3(movshdup, "movshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), INS_FLAGS_None) // Replicate odd-indexed Single FP Values +INST3(phminposuw, "phminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), INS_FLAGS_None) // Packed Horizontal Word Minimum +INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference +INST3(pinsrb, "pinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), 
INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte +INST3(pinsrd, "pinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword +INST3(pinsrq, "pinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword +INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Byte +INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Dword +INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Qword +INST3(pextrw_sse41, "pextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Word +INST3(extractps, "extractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Packed Floating-Point Values + +//PCLMULQDQ instructions +INST3(pclmulqdq, "pclmulqdq" , IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords + +//AES instructions +INST3(aesdec, "aesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow +INST3(aesdeclast, "aesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow +INST3(aesenc, "aesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow +INST3(aesenclast, "aesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow +INST3(aesimc, "aesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), INS_FLAGS_None) // Perform the AES InvMixColumn Transformation +INST3(aeskeygenassist, "aeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), INS_FLAGS_None) // AES Round Key Generation Assist +INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + +INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +// AVX only instructions +INST3(vbroadcastss, "broadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register +INST3(vbroadcastsd, "broadcastsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register +INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), INS_FLAGS_None) // Broadcast int8 value from reg/memory to entire ymm register +INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register +INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register +INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract 128-bit packed floating point values +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract 128-bit packed integer values +INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values 
+INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) +INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register +INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_FLAGS_None) // Permute 64-bit of input register +INST3(vpblendd, "pblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs +INST3(vblendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4A), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Singles +INST3(vblendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4B), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Doubles +INST3(vpblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes +INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_FLAGS_None) // Packed Bit Test +INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_FLAGS_None) // Packed Bit Test +INST3(vpsrlvd, "psrlvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical +INST3(vpsravd, "psravd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic +INST3(vpsllvd, "psllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpsllvq, "psllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical +INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_FLAGS_None) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values +INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), INS_FLAGS_None) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values +INST3(vpermilpsvar, "permilpsvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values +INST3(vpermilpdvar, "permilpdvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values +INST3(vperm2f128, "perm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values +INST3(vpermpd, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x01), INS_FLAGS_None) // Permute Double-Precision Floating-Point Values +INST3(vpermd, "permd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Packed Doublewords Elements +INST3(vpermps, "permps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Single-Precision Floating-Point Elements +INST3(vbroadcastf128, "broadcastf128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_FLAGS_None) // Broadcast packed float values read from memory to entire ymm register +INST3(vbroadcasti128, "broadcasti128", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x5A), INS_FLAGS_None) // Broadcast packed integer values read from memory to entire ymm register +INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores +INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores +INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores +INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores +INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword +INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword +INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices +INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword with Signed Dword Indices +INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Dword Indices +INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices +INST3(vgatherdpd, "gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices +INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices + +INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +// id nm um mr mi rm flags +INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd132ss, "fmadd132ss", IUM_WR, 
BAD_CODE, BAD_CODE, SSE38(0x99), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values +INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values +INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused 
Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values +INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values +INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values +INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values +INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values +INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values +INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values +INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values +INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, 
BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // +INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + +// BMI1 +INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND NOT +INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit +INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit +INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Reset Lowest Set Bit +INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract + +// BMI2 +INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_FLAGS_None) +INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit +INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract +INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Zero High Bits Starting with Specified Bit Position +INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags + +INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + +INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) + +// Scalar instructions in SSE4.2 +INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_FLAGS_None) + +// BMI1 +INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_FLAGS_None) // Count the Number of Trailing Zero Bits + +// LZCNT +INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_FLAGS_None) + +// POPCNT +INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_FLAGS_None) + +// id nm um mr mi flags +INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_FLAGS_None) +INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_FLAGS_None) +INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_FLAGS_WritesFlags) + +INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_FLAGS_WritesFlags) +INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_FLAGS_WritesFlags) +INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_FLAGS_WritesFlags) +INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_FLAGS_WritesFlags) +INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_FLAGS_WritesFlags) +INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_FLAGS_WritesFlags) + +INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, 
INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) +INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_FLAGS_WritesFlags) +INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_FLAGS_WritesFlags) +INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_FLAGS_WritesFlags) +INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_FLAGS_WritesFlags) +INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_FLAGS_WritesFlags) +INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_FLAGS_WritesFlags) +INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_FLAGS_WritesFlags) +INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_FLAGS_WritesFlags) +INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_FLAGS_WritesFlags) + + +// id nm um mr flags +INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_FLAGS_None) +INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_FLAGS_None) +#if defined(TARGET_AMD64) +INST1(r_movsq, "rep movsq", IUM_RD, 0xF3A548, INS_FLAGS_None) +#endif // defined(TARGET_AMD64) +INST1(movsb, "movsb", IUM_RD, 0x0000A4, INS_FLAGS_None) +INST1(movsd, "movsd", IUM_RD, 0x0000A5, INS_FLAGS_None) +#if defined(TARGET_AMD64) +INST1(movsq, "movsq", IUM_RD, 0x00A548, INS_FLAGS_None) +#endif // defined(TARGET_AMD64) + +INST1(r_stosb, "rep stosb", IUM_RD, 0x00AAF3, INS_FLAGS_None) +INST1(r_stosd, "rep stosd", IUM_RD, 0x00ABF3, INS_FLAGS_None) +#if defined(TARGET_AMD64) +INST1(r_stosq, "rep stosq", IUM_RD, 0xF3AB48, INS_FLAGS_None) +#endif // defined(TARGET_AMD64) +INST1(stosb, "stosb", IUM_RD, 0x0000AA, INS_FLAGS_None) +INST1(stosd, "stosd", IUM_RD, 0x0000AB, INS_FLAGS_None) +#if defined(TARGET_AMD64) +INST1(stosq, "stosq", IUM_RD, 0x00AB48, INS_FLAGS_None) +#endif // defined(TARGET_AMD64) + +INST1(int3, "int3", IUM_RD, 0x0000CC, INS_FLAGS_None) +INST1(nop, "nop", IUM_RD, 0x000090, INS_FLAGS_None) +INST1(lock, "lock", IUM_RD, 0x0000F0, INS_FLAGS_None) +INST1(leave, "leave", IUM_RD, 0x0000C9, INS_FLAGS_None) + + +INST1(neg, "neg", IUM_RW, 0x0018F6, INS_FLAGS_WritesFlags) +INST1(not, "not", IUM_RW, 0x0010F6, INS_FLAGS_WritesFlags) + +INST1(cdq, "cdq", IUM_RD, 0x000099, INS_FLAGS_WritesFlags) +INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_FLAGS_WritesFlags) +INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_FLAGS_WritesFlags) // edx:eax = eax*op1 +INST1(div, "div", IUM_RD, 0x0030F6, INS_FLAGS_WritesFlags) +INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_FLAGS_WritesFlags) + +INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_FLAGS_WritesFlags) + +INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_FLAGS_WritesFlags) +INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_FLAGS_WritesFlags) + +INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_FLAGS_WritesFlags) +INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_FLAGS_WritesFlags) + +// For RyuJIT/x86, we follow the x86 calling convention that requires +// us to return floating point value on the x87 FP stack, so we need +// these instructions regardless of whether we're using full stack fp. 
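+// For example (roughly): a method that returns a float on x86 leaves its result in ST(0),
+// so the callee ends with an "fld" of the return value and the caller typically spills it
+// back to memory with "fstp".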
+#ifdef TARGET_X86 +INST1(fld, "fld", IUM_WR, 0x0000D9, INS_FLAGS_x87Instr) +INST1(fstp, "fstp", IUM_WR, 0x0018D9, INS_FLAGS_x87Instr) +#endif // TARGET_X86 + +INST1(seto, "seto", IUM_WR, 0x0F0090, INS_FLAGS_ReadsFlags) +INST1(setno, "setno", IUM_WR, 0x0F0091, INS_FLAGS_ReadsFlags) +INST1(setb, "setb", IUM_WR, 0x0F0092, INS_FLAGS_ReadsFlags) +INST1(setae, "setae", IUM_WR, 0x0F0093, INS_FLAGS_ReadsFlags) +INST1(sete, "sete", IUM_WR, 0x0F0094, INS_FLAGS_ReadsFlags) +INST1(setne, "setne", IUM_WR, 0x0F0095, INS_FLAGS_ReadsFlags) +INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_FLAGS_ReadsFlags) +INST1(seta, "seta", IUM_WR, 0x0F0097, INS_FLAGS_ReadsFlags) +INST1(sets, "sets", IUM_WR, 0x0F0098, INS_FLAGS_ReadsFlags) +INST1(setns, "setns", IUM_WR, 0x0F0099, INS_FLAGS_ReadsFlags) +INST1(setp, "setp", IUM_WR, 0x0F009A, INS_FLAGS_ReadsFlags) +INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_FLAGS_ReadsFlags) +INST1(setl, "setl", IUM_WR, 0x0F009C, INS_FLAGS_ReadsFlags) +INST1(setge, "setge", IUM_WR, 0x0F009D, INS_FLAGS_ReadsFlags) +INST1(setle, "setle", IUM_WR, 0x0F009E, INS_FLAGS_ReadsFlags) +INST1(setg, "setg", IUM_WR, 0x0F009F, INS_FLAGS_ReadsFlags) + +#ifdef TARGET_AMD64 +// A jump with rex prefix. This is used for register indirect +// tail calls. +INST1(rex_jmp, "rex.jmp", IUM_RD, 0x0020FE, INS_FLAGS_None) +#endif + +INST1(i_jmp, "jmp", IUM_RD, 0x0020FE, INS_FLAGS_None) + +INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_FLAGS_None) +INST0(jo, "jo", IUM_RD, 0x000070, INS_FLAGS_ReadsFlags) +INST0(jno, "jno", IUM_RD, 0x000071, INS_FLAGS_ReadsFlags) +INST0(jb, "jb", IUM_RD, 0x000072, INS_FLAGS_ReadsFlags) +INST0(jae, "jae", IUM_RD, 0x000073, INS_FLAGS_ReadsFlags) +INST0(je, "je", IUM_RD, 0x000074, INS_FLAGS_ReadsFlags) +INST0(jne, "jne", IUM_RD, 0x000075, INS_FLAGS_ReadsFlags) +INST0(jbe, "jbe", IUM_RD, 0x000076, INS_FLAGS_ReadsFlags) +INST0(ja, "ja", IUM_RD, 0x000077, INS_FLAGS_ReadsFlags) +INST0(js, "js", IUM_RD, 0x000078, INS_FLAGS_ReadsFlags) +INST0(jns, "jns", IUM_RD, 0x000079, INS_FLAGS_ReadsFlags) +INST0(jp, "jp", IUM_RD, 0x00007A, INS_FLAGS_ReadsFlags) +INST0(jnp, "jnp", IUM_RD, 0x00007B, INS_FLAGS_ReadsFlags) +INST0(jl, "jl", IUM_RD, 0x00007C, INS_FLAGS_ReadsFlags) +INST0(jge, "jge", IUM_RD, 0x00007D, INS_FLAGS_ReadsFlags) +INST0(jle, "jle", IUM_RD, 0x00007E, INS_FLAGS_ReadsFlags) +INST0(jg, "jg", IUM_RD, 0x00007F, INS_FLAGS_ReadsFlags) + +INST0(l_jmp, "jmp", IUM_RD, 0x0000E9, INS_FLAGS_None) +INST0(l_jo, "jo", IUM_RD, 0x00800F, INS_FLAGS_ReadsFlags) +INST0(l_jno, "jno", IUM_RD, 0x00810F, INS_FLAGS_ReadsFlags) +INST0(l_jb, "jb", IUM_RD, 0x00820F, INS_FLAGS_ReadsFlags) +INST0(l_jae, "jae", IUM_RD, 0x00830F, INS_FLAGS_ReadsFlags) +INST0(l_je, "je", IUM_RD, 0x00840F, INS_FLAGS_ReadsFlags) +INST0(l_jne, "jne", IUM_RD, 0x00850F, INS_FLAGS_ReadsFlags) +INST0(l_jbe, "jbe", IUM_RD, 0x00860F, INS_FLAGS_ReadsFlags) +INST0(l_ja, "ja", IUM_RD, 0x00870F, INS_FLAGS_ReadsFlags) +INST0(l_js, "js", IUM_RD, 0x00880F, INS_FLAGS_ReadsFlags) +INST0(l_jns, "jns", IUM_RD, 0x00890F, INS_FLAGS_ReadsFlags) +INST0(l_jp, "jp", IUM_RD, 0x008A0F, INS_FLAGS_ReadsFlags) +INST0(l_jnp, "jnp", IUM_RD, 0x008B0F, INS_FLAGS_ReadsFlags) +INST0(l_jl, "jl", IUM_RD, 0x008C0F, INS_FLAGS_ReadsFlags) +INST0(l_jge, "jge", IUM_RD, 0x008D0F, INS_FLAGS_ReadsFlags) +INST0(l_jle, "jle", IUM_RD, 0x008E0F, INS_FLAGS_ReadsFlags) +INST0(l_jg, "jg", IUM_RD, 0x008F0F, INS_FLAGS_ReadsFlags) + +INST0(align, "align", IUM_RD, BAD_CODE, INS_FLAGS_None) + +/*****************************************************************************/ +#undef INST0 +#undef INST1 
+#undef INST2 +#undef INST3 +#undef INST4 +#undef INST5 +/*****************************************************************************/ + +// clang-format on diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index 62e7ac8059b1..e4d4ac9360aa 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -147,6 +147,7 @@ #if !defined(HOST_ARM64) #define _CROSS_COMPILER_ #endif +#elif defined(TARGET_WASM32) #else #error Unsupported or unset target architecture #endif @@ -194,6 +195,8 @@ #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_ARMNT #elif defined(TARGET_ARM64) #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_ARM64 // 0xAA64 +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_AMD64 // TODO: what is this? #else #error Unsupported or unset target architecture #endif @@ -239,10 +242,11 @@ #define UNIX_AMD64_ABI_ONLY(x) #endif // defined(UNIX_AMD64_ABI) -#if defined(DEBUG) && !defined(OSX_ARM64_ABI) +#if defined(DEBUG) && !defined(OSX_ARM64_ABI) && !defined(TARGET_WASM32) // On all platforms except Arm64 OSX arguments on the stack are taking // register size slots. On these platforms we could check that stack slots count // matchs out new byte size calculations. +// For Wasm32 doubles are 8 bytes so can't be asserted against the size of a "register" #define DEBUG_ARG_SLOTS #endif diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index a9abde124fc3..70e54fc59005 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5700,7 +5700,7 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, #if defined(TARGET_X86) argOffs += TARGET_POINTER_SIZE; -#elif defined(TARGET_AMD64) +#elif defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm // Register arguments on AMD64 also takes stack space. (in the backing store) varDsc->SetStackOffset(argOffs); argOffs += TARGET_POINTER_SIZE; diff --git a/src/coreclr/jit/liveness.cpp b/src/coreclr/jit/liveness.cpp index d719970adeac..3d3c5481de28 100644 --- a/src/coreclr/jit/liveness.cpp +++ b/src/coreclr/jit/liveness.cpp @@ -1027,7 +1027,7 @@ void Compiler::fgExtendDbgLifetimes() LIR::Range initRange = LIR::EmptyRange(); initRange.InsertBefore(nullptr, zero, store); -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) DecomposeLongs::DecomposeRange(this, initRange); #endif // !defined(TARGET_64BIT) m_pLowering->LowerRange(block, initRange); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 7bd60f3f21a2..7d4725cb3ccd 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -1367,7 +1367,9 @@ void Lowering::LowerArg(GenTreeCall* call, GenTree** ppArg) // For longs, we will replace the GT_LONG with a GT_FIELD_LIST, and put that under a PUTARG_STK. // Although the hi argument needs to be pushed first, that will be handled by the general case, // in which the fields will be reversed. 
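+        // For example (conceptually), for a long argument on a 32-bit target:
+        //   PUTARG_STK(GT_LONG(lo, hi))  ==>  PUTARG_STK(FIELD_LIST(lo@0, hi@4))
+        // and because the field list is handled in reverse, hi ends up being pushed first.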
+#ifdef DEBUG_ARG_SLOTS assert(info->numSlots == 2); +#endif newArg->SetRegNum(REG_STK); BlockRange().InsertBefore(arg, fieldList, newArg); } @@ -5337,7 +5339,7 @@ GenTree* Lowering::LowerConstIntDivOrMod(GenTree* node) return nullptr; } -#if defined(TARGET_XARCH) || defined(TARGET_ARM64) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm ssize_t magic; int shift; @@ -5801,7 +5803,7 @@ PhaseStatus Lowering::DoPhase() InsertPInvokeMethodProlog(); } -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) DecomposeLongs decomp(comp); // Initialize the long decomposition class. if (comp->compLongUsed) { @@ -5814,7 +5816,7 @@ PhaseStatus Lowering::DoPhase() /* Make the block publicly available */ comp->compCurBB = block; -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) if (comp->compLongUsed) { decomp.DecomposeBlock(block); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index c8500c0636b9..e63955921580 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -97,7 +97,7 @@ class Lowering final : public Phase void ContainCheckCompare(GenTreeOp* node); void ContainCheckBinary(GenTreeOp* node); void ContainCheckBoundsChk(GenTreeBoundsChk* node); -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined (TARGET_WASM32) || defined(TARGET_WASM64) void ContainCheckFloatBinary(GenTreeOp* node); void ContainCheckIntrinsic(GenTreeOp* node); #endif // TARGET_XARCH @@ -228,7 +228,7 @@ class Lowering final : public Phase // return true if this call target is within range of a pc-rel call on the machine bool IsCallTargetInRange(void* addr); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined (TARGET_WASM32) || defined(TARGET_WASM64) GenTree* PreferredRegOptionalOperand(GenTree* tree); // ------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp new file mode 100644 index 000000000000..4efd683c830c --- /dev/null +++ b/src/coreclr/jit/lowerwasm.cpp @@ -0,0 +1,4907 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX Lowering for AMD64, x86 XX +XX XX +XX This encapsulates all the logic for lowering trees for the AMD64 XX +XX architecture. For a more detailed view of what is lowering, please XX +XX take a look at Lower.cpp XX +XX XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#if defined (TARGET_WASM32) || defined(TARGET_WASM64) // This file is only used for wasm + +#include "jit.h" +#include "sideeffects.h" +#include "lower.h" + +// xarch supports both ROL and ROR instructions so no lowering is required. +void Lowering::LowerRotate(GenTree* tree) +{ + ContainCheckShiftRotate(tree->AsOp()); +} + +//------------------------------------------------------------------------ +// LowerStoreLoc: Lower a store of a lclVar +// +// Arguments: +// storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR) +// +// Notes: +// This involves: +// - Handling of contained immediates. 
+// - Widening operations of unsigneds. + +void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) +{ + // Try to widen the ops if they are going into a local var. + if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT)) + { + GenTreeIntCon* con = storeLoc->gtOp1->AsIntCon(); + ssize_t ival = con->gtIconVal; + + unsigned varNum = storeLoc->GetLclNum(); + LclVarDsc* varDsc = comp->lvaTable + varNum; + + if (varDsc->lvIsSIMDType()) + { + noway_assert(storeLoc->gtType != TYP_STRUCT); + } + unsigned size = genTypeSize(storeLoc); + // If we are storing a constant into a local variable + // we extend the size of the store here + if ((size < 4) && !varTypeIsStruct(varDsc)) + { + if (!varTypeIsUnsigned(varDsc)) + { + if (genTypeSize(storeLoc) == 1) + { + if ((ival & 0x7f) != ival) + { + ival = ival | 0xffffff00; + } + } + else + { + assert(genTypeSize(storeLoc) == 2); + if ((ival & 0x7fff) != ival) + { + ival = ival | 0xffff0000; + } + } + } + + // A local stack slot is at least 4 bytes in size, regardless of + // what the local var is typed as, so auto-promote it here + // unless it is a field of a promoted struct + // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this? + if (!varDsc->lvIsStructField) + { + storeLoc->gtType = TYP_INT; + con->SetIconValue(ival); + } + } + } + if (storeLoc->OperIs(GT_STORE_LCL_FLD)) + { + // We should only encounter this for lclVars that are lvDoNotEnregister. + verifyLclFldDoNotEnregister(storeLoc->GetLclNum()); + } + ContainCheckStoreLoc(storeLoc); +} + +//------------------------------------------------------------------------ +// LowerStoreIndir: Determine addressing mode for an indirection, and whether operands are contained. +// +// Arguments: +// node - The indirect store node (GT_STORE_IND) of interest +// +// Return Value: +// None. +// +void Lowering::LowerStoreIndir(GenTreeIndir* node) +{ + // Mark all GT_STOREIND nodes to indicate that it is not known + // whether it represents a RMW memory op. + node->AsStoreInd()->SetRMWStatusDefault(); + + if (!varTypeIsFloating(node)) + { + // Perform recognition of trees with the following structure: + // StoreInd(addr, BinOp(expr, GT_IND(addr))) + // to be able to fold this into an instruction of the form + // BINOP [addr], register + // where register is the actual place where 'expr' is computed. + // + // SSE2 doesn't support RMW form of instructions. + if (LowerRMWMemOp(node)) + { + return; + } + } + ContainCheckStoreIndir(node); +} + +//------------------------------------------------------------------------ +// LowerBlockStore: Lower a block store node +// +// Arguments: +// blkNode - The block store node to lower +// +void Lowering::LowerBlockStore(GenTreeBlk* blkNode) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// ContainBlockStoreAddress: Attempt to contain an address used by an unrolled block store. 
+// +// Arguments: +// blkNode - the block store node +// size - the block size +// addr - the address node to try to contain +// +void Lowering::ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenTree* addr) +{ + assert(blkNode->OperIs(GT_STORE_BLK) && (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)); + assert(size < INT32_MAX); + + if (addr->OperIsLocalAddr()) + { + addr->SetContained(); + return; + } + + if (!addr->OperIsAddrMode() && !TryCreateAddrMode(addr, true)) + { + return; + } + + GenTreeAddrMode* addrMode = addr->AsAddrMode(); + + // On x64 the address mode displacement is signed so it must not exceed INT32_MAX. This check is + // an approximation since the last displacement we generate in an unrolled block operation can be + // up to 16 bytes lower than offset + size. But offsets large enough to hit this case are likely + // to be extremely rare for this to ever be a CQ issue. + // On x86 this shouldn't be needed but then again, offsets large enough to hit this are rare. + if (addrMode->Offset() > (INT32_MAX - static_cast(size))) + { + return; + } + + // Note that the parentNode is always the block node, even if we're dealing with the source address. + // The source address is not directly used by the block node but by an IND node and that IND node is + // always contained. + if (!IsSafeToContainMem(blkNode, addrMode)) + { + return; + } + + addrMode->SetContained(); +} + +//------------------------------------------------------------------------ +// LowerPutArgStk: Lower a GT_PUTARG_STK. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// None. +// +void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) +{ + GenTree* src = putArgStk->gtGetOp1(); + + if (src->OperIs(GT_FIELD_LIST)) + { +#ifdef TARGET_X86 + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid; + + GenTreeFieldList* fieldList = src->AsFieldList(); + + // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order + // of uses is visible to LSRA. + assert(fieldList->Uses().IsSorted()); + fieldList->Uses().Reverse(); + + // Now that the fields have been sorted, the kind of code we will generate. + bool allFieldsAreSlots = true; + unsigned prevOffset = putArgStk->GetStackByteSize(); + for (GenTreeFieldList::Use& use : fieldList->Uses()) + { + GenTree* const fieldNode = use.GetNode(); + const var_types fieldType = fieldNode->TypeGet(); + const unsigned fieldOffset = use.GetOffset(); + assert(fieldType != TYP_LONG); + + // We can treat as a slot any field that is stored at a slot boundary, where the previous + // field is not in the same slot. (Note that we store the fields in reverse order.) + const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); + if (!fieldIsSlot) + { + allFieldsAreSlots = false; + } + + // For x86 we must mark all integral fields as contained or reg-optional, and handle them + // accordingly in code generation, since we may have up to 8 fields, which cannot all be in + // registers to be consumed atomically by the call. 
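+            // For example (roughly): a small constant field is simply contained and emitted as
+            // "push imm32", an enregisterable lclVar is left reg-optional, and a lclVar that
+            // cannot be enregistered is contained so it is pushed directly from its stack home.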
+ if (varTypeIsIntegralOrI(fieldNode)) + { + if (fieldNode->OperGet() == GT_LCL_VAR) + { + LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->GetLclNum()]); + if (!varDsc->lvDoNotEnregister) + { + fieldNode->SetRegOptional(); + } + else + { + MakeSrcContained(putArgStk, fieldNode); + } + } + else if (fieldNode->IsIntCnsFitsInI32()) + { + MakeSrcContained(putArgStk, fieldNode); + } + else + { + // For the case where we cannot directly push the value, if we run out of registers, + // it would be better to defer computation until we are pushing the arguments rather + // than spilling, but this situation is not all that common, as most cases of promoted + // structs do not have a large number of fields, and of those most are lclVars or + // copy-propagated constants. + fieldNode->SetRegOptional(); + } + } + + prevOffset = fieldOffset; + } + + // Set the copy kind. + // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should + // adjust the stack once for those fields. The latter is really best done in code generation, but + // this tuning should probably be undertaken as a whole. + // Also, if there are floating point fields, it may be better to use the "Unroll" mode + // of copying the struct as a whole, if the fields are not register candidates. + if (allFieldsAreSlots) + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots; + } + else + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; + } +#endif // TARGET_X86 + return; + } + +#ifdef FEATURE_PUT_STRUCT_ARG_STK + if (src->TypeGet() != TYP_STRUCT) +#endif // FEATURE_PUT_STRUCT_ARG_STK + { + // If the child of GT_PUTARG_STK is a constant, we don't need a register to + // move it to memory (stack location). + // + // On AMD64, we don't want to make 0 contained, because we can generate smaller code + // by zeroing a register and then storing it. E.g.: + // xor rdx, rdx + // mov gword ptr [rsp+28H], rdx + // is 2 bytes smaller than: + // mov gword ptr [rsp+28H], 0 + // + // On x86, we push stack arguments; we don't use 'mov'. So: + // push 0 + // is 1 byte smaller than: + // xor rdx, rdx + // push rdx + + if (IsContainableImmed(putArgStk, src) +#if defined(TARGET_AMD64) + && !src->IsIntegralConst(0) +#endif // TARGET_AMD64 + ) + { + MakeSrcContained(putArgStk, src); + } + return; + } + +#ifdef FEATURE_PUT_STRUCT_ARG_STK + GenTree* srcAddr = nullptr; + + bool haveLocalAddr = false; + if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) + { + srcAddr = src->AsOp()->gtOp1; + assert(srcAddr != nullptr); + haveLocalAddr = srcAddr->OperIsLocalAddr(); + } + else + { + assert(varTypeIsSIMD(putArgStk)); + } + + ClassLayout* layout = src->AsObj()->GetLayout(); + + // In case of a CpBlk we could use a helper call. In case of putarg_stk we + // can't do that since the helper call could kill some already set up outgoing args. + // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. + // The cpyXXXX code is rather complex and this could cause it to be more complex, but + // it might be the right thing to do. + + unsigned size = putArgStk->GetStackByteSize(); + + // TODO-X86-CQ: The helper call either is not supported on x86 or required more work + // (I don't know which). 
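+    // For example (roughly): a small struct with no GC pointers is copied by unrolling
+    // (or pushed slot by slot on x86 when it is smaller than an XMM register), a struct
+    // containing GC references is pushed so the emitter can track them, and anything else
+    // falls back to a rep-movs style copy (Kind::RepInstr).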
+ + if (size <= CPBLK_UNROLL_LIMIT && !layout->HasGCPtr()) + { +#ifdef TARGET_X86 + if (size < XMM_REGSIZE_BYTES) + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; + } + else +#endif // TARGET_X86 + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll; + } + } +#ifdef TARGET_X86 + else if (layout->HasGCPtr()) + { + // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update + // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions. + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; + } +#endif // TARGET_X86 + else + { + putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; + } + // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. + MakeSrcContained(putArgStk, src); + if (haveLocalAddr) + { + // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary + // copies. + // + MakeSrcContained(putArgStk, srcAddr); + } +#endif // FEATURE_PUT_STRUCT_ARG_STK +} + +/* Lower GT_CAST(srcType, DstType) nodes. + * + * Casts from small int type to float/double are transformed as follows: + * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double) + * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double) + * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double) + * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double) + * + * SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64 + * are morphed as follows by front-end and hence should not be seen here. + * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double) + * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float) + * + * + * Similarly casts from float/double to a smaller int type are transformed as follows: + * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) + * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) + * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) + * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) + * + * SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit + * integer. The above transformations help us to leverage those instructions. + * + * Note that for the following conversions we still depend on helper calls and + * don't expect to see them here. + * i) GT_CAST(float/double, uint64) + * ii) GT_CAST(float/double, int type with overflow detection) + * + * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above. + * There are hardly any occurrences of this conversion operation in platform + * assemblies or in CQ perf benchmarks (1 occurrence in corelib, microsoft.jscript, + * 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics + * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that + * doing this optimization is a win, should consider generating in-lined code. 
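+ *
+ * For example (roughly): GT_CAST(byte, float) is rewritten here as
+ * GT_CAST(GT_CAST(byte, int32), float), i.e. the byte is widened to int32 first so that
+ * only the int32 -> float conversion is left for the backend to handle.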
+ */ +void Lowering::LowerCast(GenTree* tree) +{ + assert(tree->OperGet() == GT_CAST); + + GenTree* castOp = tree->AsCast()->CastOp(); + var_types castToType = tree->CastToType(); + var_types srcType = castOp->TypeGet(); + var_types tmpType = TYP_UNDEF; + + // force the srcType to unsigned if GT_UNSIGNED flag is set + if (tree->gtFlags & GTF_UNSIGNED) + { + srcType = genUnsignedType(srcType); + } + + // We should never see the following casts as they are expected to be lowered + // apropriately or converted into helper calls by front-end. + // srcType = float/double castToType = * and overflow detecting cast + // Reason: must be converted to a helper call + // srcType = float/double, castToType = ulong + // Reason: must be converted to a helper call + // srcType = uint castToType = float/double + // Reason: uint -> float/double = uint -> long -> float/double + // srcType = ulong castToType = float + // Reason: ulong -> float = ulong -> double -> float + if (varTypeIsFloating(srcType)) + { + noway_assert(!tree->gtOverflow()); + noway_assert(castToType != TYP_ULONG); + } + else if (srcType == TYP_UINT) + { + noway_assert(!varTypeIsFloating(castToType)); + } + else if (srcType == TYP_ULONG) + { + noway_assert(castToType != TYP_FLOAT); + } + + // Case of src is a small type and dst is a floating point type. + if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType)) + { + // These conversions can never be overflow detecting ones. + noway_assert(!tree->gtOverflow()); + tmpType = TYP_INT; + } + // case of src is a floating point type and dst is a small type. + else if (varTypeIsFloating(srcType) && varTypeIsSmall(castToType)) + { + tmpType = TYP_INT; + } + + if (tmpType != TYP_UNDEF) + { + GenTree* tmp = comp->gtNewCastNode(tmpType, castOp, tree->IsUnsigned(), tmpType); + tmp->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); + + tree->gtFlags &= ~GTF_UNSIGNED; + tree->AsOp()->gtOp1 = tmp; + BlockRange().InsertAfter(castOp, tmp); + ContainCheckCast(tmp->AsCast()); + } + + // Now determine if we have operands that should be contained. + ContainCheckCast(tree->AsCast()); +} + +#ifdef FEATURE_SIMD +//---------------------------------------------------------------------------------------------- +// Lowering::LowerSIMD: Perform containment analysis for a SIMD intrinsic node. +// +// Arguments: +// simdNode - The SIMD intrinsic node. 
+// +void Lowering::LowerSIMD(GenTreeSIMD* simdNode) +{ + if (simdNode->TypeGet() == TYP_SIMD12) + { + // GT_SIMD node requiring to produce TYP_SIMD12 in fact + // produces a TYP_SIMD16 result + simdNode->gtType = TYP_SIMD16; + } + + if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN) + { + assert(simdNode->gtSIMDBaseType == TYP_FLOAT); + + int argCount = 0; + int constArgCount = 0; + float constArgValues[4]{0, 0, 0, 0}; + + for (GenTreeArgList* list = simdNode->gtGetOp1()->AsArgList(); list != nullptr; list = list->Rest()) + { + GenTree* arg = list->Current(); + + assert(arg->TypeGet() == simdNode->gtSIMDBaseType); + assert(argCount < (int)_countof(constArgValues)); + + if (arg->IsCnsFltOrDbl()) + { + constArgValues[constArgCount] = static_cast(arg->AsDblCon()->gtDconVal); + constArgCount++; + } + + argCount++; + } + + if (constArgCount == argCount) + { + for (GenTreeArgList* list = simdNode->gtGetOp1()->AsArgList(); list != nullptr; list = list->Rest()) + { + BlockRange().Remove(list->Current()); + } + + assert(sizeof(constArgValues) == 16); + + unsigned cnsSize = sizeof(constArgValues); + unsigned cnsAlign = (comp->compCodeOpt() != Compiler::SMALL_CODE) ? cnsSize : 1; + + CORINFO_FIELD_HANDLE hnd = + comp->GetEmitter()->emitBlkConst(constArgValues, cnsSize, cnsAlign, simdNode->gtSIMDBaseType); + GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr); + BlockRange().InsertBefore(simdNode, clsVarAddr); + simdNode->ChangeOper(GT_IND); + simdNode->gtOp1 = clsVarAddr; + ContainCheckIndir(simdNode->AsIndir()); + + return; + } + } + +#ifdef TARGET_XARCH + if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND)) + { + // If SIMD vector is already in memory, we force its + // addr to be evaluated into a reg. This would allow + // us to generate [regBase] or [regBase+offset] or + // [regBase+sizeOf(SIMD vector baseType)*regIndex] + // to access the required SIMD vector element directly + // from memory. + // + // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we + // might be able update GT_LEA to fold the regIndex + // or offset in some cases. Instead with this + // approach we always evaluate GT_LEA into a reg. + // Ideally, we should be able to lower GetItem intrinsic + // into GT_IND(newAddr) where newAddr combines + // the addr of SIMD vector with the given index. + simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + } +#endif + ContainCheckSIMD(simdNode); +} +#endif // FEATURE_SIMD + +#ifdef FEATURE_HW_INTRINSICS + +//---------------------------------------------------------------------------------------------- +// LowerHWIntrinsicCC: Lowers a hardware intrinsic node that produces a boolean value by +// setting the condition flags. 
+// +// Arguments: +// node - The hardware intrinsic node +// newIntrinsicId - The intrinsic id of the lowered intrinsic node +// condition - The condition code of the generated SETCC/JCC node +// +void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition) +{ + GenTreeCC* cc = LowerNodeCC(node, condition); + + node->gtHWIntrinsicId = newIntrinsicId; + node->gtType = TYP_VOID; + node->ClearUnusedValue(); + + bool swapOperands = false; + bool canSwapOperands = false; + + switch (newIntrinsicId) + { + case NI_SSE_COMISS: + case NI_SSE_UCOMISS: + case NI_SSE2_COMISD: + case NI_SSE2_UCOMISD: + // In some cases we can generate better code if we swap the operands: + // - If the condition is not one of the "preferred" floating point conditions we can swap + // the operands and change the condition to avoid generating an extra JP/JNP branch. + // - If the first operand can be contained but the second cannot, we can swap operands in + // order to be able to contain the first operand and avoid the need for a temp reg. + // We can't handle both situations at the same time and since an extra branch is likely to + // be worse than an extra temp reg (x64 has a reasonable number of XMM registers) we'll favor + // the branch case: + // - If the condition is not preferred then swap, even if doing this will later prevent + // containment. + // - Allow swapping for containment purposes only if this doesn't result in a non-"preferred" + // condition being generated. + if ((cc != nullptr) && cc->gtCondition.PreferSwap()) + { + swapOperands = true; + } + else + { + canSwapOperands = (cc == nullptr) || !GenCondition::Swap(cc->gtCondition).PreferSwap(); + } + break; + + case NI_SSE41_PTEST: + case NI_AVX_PTEST: + // If we need the Carry flag then we can't swap operands. + canSwapOperands = (cc == nullptr) || cc->gtCondition.Is(GenCondition::EQ, GenCondition::NE); + break; + + default: + unreached(); + } + + if (canSwapOperands) + { + bool op1SupportsRegOptional = false; + bool op2SupportsRegOptional = false; + + if (!IsContainableHWIntrinsicOp(node, node->gtGetOp2(), &op2SupportsRegOptional) && + IsContainableHWIntrinsicOp(node, node->gtGetOp1(), &op1SupportsRegOptional)) + { + // Swap operands if op2 cannot be contained but op1 can. + swapOperands = true; + } + } + + if (swapOperands) + { + std::swap(node->gtOp1, node->gtOp2); + + if (cc != nullptr) + { + cc->gtCondition = GenCondition::Swap(cc->gtCondition); + } + } +} + +//---------------------------------------------------------------------------------------------- +// LowerFusedMultiplyAdd: Changes NI_FMA_MultiplyAddScalar produced by Math(F).FusedMultiplyAdd +// to a better FMA intrinsics if there are GT_NEG around in order to eliminate them. 
+// +// Arguments: +// node - The hardware intrinsic node +// +// Notes: +// Math(F).FusedMultiplyAdd is expanded into NI_FMA_MultiplyAddScalar and +// depending on additional GT_NEG nodes around it can be: +// +// x * y + z -> NI_FMA_MultiplyAddScalar +// x * -y + z -> NI_FMA_MultiplyAddNegatedScalar +// -x * y + z -> NI_FMA_MultiplyAddNegatedScalar +// -x * -y + z -> NI_FMA_MultiplyAddScalar +// x * y - z -> NI_FMA_MultiplySubtractScalar +// x * -y - z -> NI_FMA_MultiplySubtractNegatedScalar +// -x * y - z -> NI_FMA_MultiplySubtractNegatedScalar +// -x * -y - z -> NI_FMA_MultiplySubtractScalar +// +void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node) +{ + assert(node->gtHWIntrinsicId == NI_FMA_MultiplyAddScalar); + GenTreeArgList* argList = node->gtGetOp1()->AsArgList(); + GenTreeHWIntrinsic* createScalarOps[3]; + + for (GenTreeHWIntrinsic*& createScalarOp : createScalarOps) + { + GenTree*& current = argList->Current(); + assert(current != nullptr); + if (!current->OperIsHWIntrinsic()) + { + return; // Math(F).FusedMultiplyAdd is expected to emit three NI_Vector128_CreateScalarUnsafe + // but it's also possible to use NI_FMA_MultiplyAddScalar directly with any operands + } + GenTreeHWIntrinsic* hwArg = current->AsHWIntrinsic(); + if (hwArg->gtHWIntrinsicId != NI_Vector128_CreateScalarUnsafe) + { + return; + } + createScalarOp = hwArg; + argList = argList->Rest(); + } + assert(argList == nullptr); + + GenTree* argX = createScalarOps[0]->gtGetOp1(); + GenTree* argY = createScalarOps[1]->gtGetOp1(); + GenTree* argZ = createScalarOps[2]->gtGetOp1(); + + const bool negMul = argX->OperIs(GT_NEG) != argY->OperIs(GT_NEG); + if (argX->OperIs(GT_NEG)) + { + createScalarOps[0]->gtOp1 = argX->gtGetOp1(); + BlockRange().Remove(argX); + } + if (argY->OperIs(GT_NEG)) + { + createScalarOps[1]->gtOp1 = argY->gtGetOp1(); + BlockRange().Remove(argY); + } + if (argZ->OperIs(GT_NEG)) + { + createScalarOps[2]->gtOp1 = argZ->gtGetOp1(); + BlockRange().Remove(argZ); + node->gtHWIntrinsicId = negMul ? NI_FMA_MultiplySubtractNegatedScalar : NI_FMA_MultiplySubtractScalar; + } + else + { + node->gtHWIntrinsicId = negMul ? NI_FMA_MultiplyAddNegatedScalar : NI_FMA_MultiplyAddScalar; + } +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node. +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) +{ + if (node->TypeGet() == TYP_SIMD12) + { + // GT_HWINTRINSIC node requiring to produce TYP_SIMD12 in fact + // produces a TYP_SIMD16 result + node->gtType = TYP_SIMD16; + } + + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + + switch (intrinsicId) + { + case NI_Vector128_Create: + case NI_Vector256_Create: + { + // We don't directly support the Vector128.Create or Vector256.Create methods in codegen + // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect + // that the node is modified to either not be a HWIntrinsic node or that it is no longer + // the same intrinsic as when it came in. 
In the case of Vector256.Create, we may lower + // it into 2x Vector128.Create intrinsics which themselves are also lowered into other + // intrinsics that are not Vector*.Create + + LowerHWIntrinsicCreate(node); + assert(!node->OperIsHWIntrinsic() || (node->gtHWIntrinsicId != intrinsicId)); + LowerNode(node); + return; + } + + case NI_Vector128_Dot: + case NI_Vector256_Dot: + { + LowerHWIntrinsicDot(node); + return; + } + + case NI_Vector128_op_Equality: + case NI_Vector256_op_Equality: + { + LowerHWIntrinsicCmpOp(node, GT_EQ); + return; + } + + case NI_Vector128_op_Inequality: + case NI_Vector256_op_Inequality: + { + LowerHWIntrinsicCmpOp(node, GT_NE); + return; + } + + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: + { + LowerHWIntrinsicToScalar(node); + break; + } + + case NI_SSE2_Insert: + case NI_SSE41_Insert: + case NI_SSE41_X64_Insert: + { + assert(HWIntrinsicInfo::lookupNumArgs(node) == 3); + + GenTreeArgList* argList = node->gtOp1->AsArgList(); + + // Insert takes either a 32-bit register or a memory operand. + // In either case, only gtSIMDBaseType bits are read and so + // widening or narrowing the operand may be unnecessary and it + // can just be used directly. + + argList->Rest()->gtOp1 = TryRemoveCastIfPresent(node->gtSIMDBaseType, argList->Rest()->gtOp1); + break; + } + + case NI_SSE42_Crc32: + { + assert(HWIntrinsicInfo::lookupNumArgs(node) == 2); + + // Crc32 takes either a bit register or a memory operand. + // In either case, only gtType bits are read and so widening + // or narrowing the operand may be unnecessary and it can + // just be used directly. + + node->gtOp2 = TryRemoveCastIfPresent(node->gtType, node->gtOp2); + break; + } + + case NI_SSE2_CompareGreaterThan: + { + if (node->gtSIMDBaseType != TYP_DOUBLE) + { + assert(varTypeIsIntegral(node->gtSIMDBaseType)); + break; + } + + FALLTHROUGH; + } + + case NI_SSE_CompareGreaterThan: + case NI_SSE_CompareGreaterThanOrEqual: + case NI_SSE_CompareNotGreaterThan: + case NI_SSE_CompareNotGreaterThanOrEqual: + case NI_SSE2_CompareGreaterThanOrEqual: + case NI_SSE2_CompareNotGreaterThan: + case NI_SSE2_CompareNotGreaterThanOrEqual: + { + assert((node->gtSIMDBaseType == TYP_FLOAT) || (node->gtSIMDBaseType == TYP_DOUBLE)); + + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + break; + } + + // pre-AVX doesn't actually support these intrinsics in hardware so we need to swap the operands around + std::swap(node->gtOp1, node->gtOp2); + break; + } + + case NI_SSE2_CompareLessThan: + case NI_SSE42_CompareLessThan: + case NI_AVX2_CompareLessThan: + { + if (node->gtSIMDBaseType == TYP_DOUBLE) + { + break; + } + assert(varTypeIsIntegral(node->gtSIMDBaseType)); + + // this isn't actually supported in hardware so we need to swap the operands around + std::swap(node->gtOp1, node->gtOp2); + break; + } + + case NI_SSE_CompareScalarOrderedEqual: + LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FEQ); + break; + case NI_SSE_CompareScalarOrderedNotEqual: + LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FNEU); + break; + case NI_SSE_CompareScalarOrderedLessThan: + LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FLT); + break; + case NI_SSE_CompareScalarOrderedLessThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FLE); + break; + case NI_SSE_CompareScalarOrderedGreaterThan: + LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FGT); + break; + case NI_SSE_CompareScalarOrderedGreaterThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FGE); + break; + + case 
NI_SSE_CompareScalarUnorderedEqual: + LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FEQ); + break; + case NI_SSE_CompareScalarUnorderedNotEqual: + LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FNEU); + break; + case NI_SSE_CompareScalarUnorderedLessThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FLE); + break; + case NI_SSE_CompareScalarUnorderedLessThan: + LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FLT); + break; + case NI_SSE_CompareScalarUnorderedGreaterThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FGE); + break; + case NI_SSE_CompareScalarUnorderedGreaterThan: + LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FGT); + break; + + case NI_SSE2_CompareScalarOrderedEqual: + LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FEQ); + break; + case NI_SSE2_CompareScalarOrderedNotEqual: + LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FNEU); + break; + case NI_SSE2_CompareScalarOrderedLessThan: + LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FLT); + break; + case NI_SSE2_CompareScalarOrderedLessThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FLE); + break; + case NI_SSE2_CompareScalarOrderedGreaterThan: + LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FGT); + break; + case NI_SSE2_CompareScalarOrderedGreaterThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FGE); + break; + + case NI_SSE2_CompareScalarUnorderedEqual: + LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FEQ); + break; + case NI_SSE2_CompareScalarUnorderedNotEqual: + LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FNEU); + break; + case NI_SSE2_CompareScalarUnorderedLessThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FLE); + break; + case NI_SSE2_CompareScalarUnorderedLessThan: + LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FLT); + break; + case NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual: + LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FGE); + break; + case NI_SSE2_CompareScalarUnorderedGreaterThan: + LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FGT); + break; + + case NI_SSE41_TestC: + LowerHWIntrinsicCC(node, NI_SSE41_PTEST, GenCondition::C); + break; + case NI_SSE41_TestZ: + LowerHWIntrinsicCC(node, NI_SSE41_PTEST, GenCondition::EQ); + break; + case NI_SSE41_TestNotZAndNotC: + LowerHWIntrinsicCC(node, NI_SSE41_PTEST, GenCondition::UGT); + break; + + case NI_AVX_TestC: + LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::C); + break; + case NI_AVX_TestZ: + LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::EQ); + break; + case NI_AVX_TestNotZAndNotC: + LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::UGT); + break; + + case NI_FMA_MultiplyAddScalar: + LowerFusedMultiplyAdd(node); + break; + + default: + break; + } + + ContainCheckHWIntrinsic(node); +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector128 or Vector256 comparison intrinsic +// +// Arguments: +// node - The hardware intrinsic node. 
+// cmpOp - The comparison operation, currently must be GT_EQ or GT_NE +// +void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) || + (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality)); + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + assert(node->gtType == TYP_BOOL); + assert((cmpOp == GT_EQ) || (cmpOp == GT_NE)); + + // We have the following (with the appropriate simd size and where the intrinsic could be op_Inequality): + // /--* op2 simd + // /--* op1 simd + // node = * HWINTRINSIC simd T op_Equality + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE; + + if (op2->IsIntegralConstVector(0) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // On SSE4.1 or higher we can optimize comparisons against zero to + // just use PTEST. We can't support it for floating-point, however, + // as it has both +0.0 and -0.0 where +0.0 == -0.0 + + node->gtOp1 = op1; + BlockRange().Remove(op2); + + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(op1Use); + op1 = node->gtOp1; + + op2 = comp->gtClone(op1); + BlockRange().InsertAfter(op1, op2); + node->gtOp2 = op2; + + if (simdSize == 32) + { + node->gtHWIntrinsicId = NI_AVX_TestZ; + LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd); + } + else + { + node->gtHWIntrinsicId = NI_SSE41_TestZ; + LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd); + } + + return; + } + + NamedIntrinsic cmpIntrinsic; + var_types cmpType; + NamedIntrinsic mskIntrinsic; + var_types mskType; + int mskConstant; + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + cmpType = baseType; + mskType = TYP_UBYTE; + + if (simdSize == 32) + { + cmpIntrinsic = NI_AVX2_CompareEqual; + mskIntrinsic = NI_AVX2_MoveMask; + mskConstant = -1; + } + else + { + assert(simdSize == 16); + + cmpIntrinsic = NI_SSE2_CompareEqual; + mskIntrinsic = NI_SSE2_MoveMask; + mskConstant = 0xFFFF; + } + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + mskType = TYP_UBYTE; + + if (simdSize == 32) + { + cmpIntrinsic = NI_AVX2_CompareEqual; + cmpType = baseType; + mskIntrinsic = NI_AVX2_MoveMask; + mskConstant = -1; + } + else + { + assert(simdSize == 16); + + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + cmpIntrinsic = NI_SSE41_CompareEqual; + cmpType = baseType; + } + else + { + cmpIntrinsic = NI_SSE2_CompareEqual; + cmpType = TYP_UINT; + } + + mskIntrinsic = NI_SSE2_MoveMask; + mskConstant = 0xFFFF; + } + break; + } + + case TYP_FLOAT: + { + cmpType = baseType; + mskType = baseType; + + if (simdSize == 32) + { + cmpIntrinsic = NI_AVX_CompareEqual; + mskIntrinsic = NI_AVX_MoveMask; + mskConstant = 0xFF; + } + else + { + cmpIntrinsic = NI_SSE_CompareEqual; + mskIntrinsic = NI_SSE_MoveMask; + + if (simdSize == 16) + { + mskConstant = 0xF; + } + else if (simdSize == 12) + { + mskConstant = 0x7; + } + else + { + assert(simdSize == 8); + mskConstant = 0x3; + } + } + break; + } + + case TYP_DOUBLE: + { + cmpType = baseType; + mskType = baseType; + + if 
(simdSize == 32) + { + cmpIntrinsic = NI_AVX_CompareEqual; + mskIntrinsic = NI_AVX_MoveMask; + mskConstant = 0xF; + } + else + { + assert(simdSize == 16); + + cmpIntrinsic = NI_SSE2_CompareEqual; + mskIntrinsic = NI_SSE2_MoveMask; + mskConstant = 0x3; + } + break; + } + + default: + { + unreached(); + } + } + + GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, cmpIntrinsic, cmpType, simdSize); + BlockRange().InsertBefore(node, cmp); + LowerNode(cmp); + + GenTree* msk = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, mskIntrinsic, mskType, simdSize); + BlockRange().InsertAfter(cmp, msk); + LowerNode(msk); + + GenTree* mskCns = comp->gtNewIconNode(mskConstant, TYP_INT); + BlockRange().InsertAfter(msk, mskCns); + + if ((baseType == TYP_FLOAT) && (simdSize < 16)) + { + // For TYP_SIMD8 and TYP_SIMD12 we need to clear the upper bits and can't assume their value + + GenTree* tmp = comp->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns); + BlockRange().InsertAfter(mskCns, tmp); + LowerNode(tmp); + + msk = tmp; + + mskCns = comp->gtNewIconNode(mskConstant, TYP_INT); + BlockRange().InsertAfter(msk, mskCns); + } + + node->ChangeOper(cmpOp); + + node->gtType = TYP_INT; + node->gtOp1 = msk; + node->gtOp2 = mskCns; + + GenTree* cc = LowerNodeCC(node, cmpCnd); + + node->gtType = TYP_VOID; + node->ClearUnusedValue(); + + LowerNode(node); +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 Create call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types simdType = node->gtType; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + VectorConstant vecCns = {}; + + if ((simdSize == 8) && (simdType == TYP_DOUBLE)) + { + // TODO-Cleanup: Struct retyping means we have the wrong type here. We need to + // manually fix it up so the simdType checks below are correct. + simdType = TYP_SIMD8; + } + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + GenTreeArgList* argList = nullptr; + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + GenTree* tmp3 = nullptr; + + assert(op1 != nullptr); + + unsigned argCnt = 0; + unsigned cnsArgCnt = 0; + + if (op1->OperIsList()) + { + assert(op2 == nullptr); + + for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) + { + if (HandleArgForHWIntrinsicCreate(argList->Current(), argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + } + } + else + { + if (HandleArgForHWIntrinsicCreate(op1, argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + + if (op2 != nullptr) + { + if (HandleArgForHWIntrinsicCreate(op2, argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + } + else if (cnsArgCnt == 1) + { + // These intrinsics are meant to set the same value to every element + // so we'll just specially handle it here and copy it into the remaining + // indices. 
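+                // For example (roughly): Vector128.Create(5) with an int base type records the
+                // constant once, and the loop below copies it into elements 1..3 of vecCns, so
+                // the all-constant path sees { 5, 5, 5, 5 }.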
+ + for (unsigned i = 1; i < simdSize / genTypeSize(baseType); i++) + { + HandleArgForHWIntrinsicCreate(op1, i, vecCns, baseType); + } + } + } + assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(baseType)))); + + if (argCnt == cnsArgCnt) + { + if (op1->OperIsList()) + { + for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) + { + GenTree* arg = argList->Current(); + +#if !defined(TARGET_64BIT) + if (arg->OperIsLong()) + { + BlockRange().Remove(arg->AsOp()->gtOp1); + BlockRange().Remove(arg->AsOp()->gtOp2); + } +#endif // !TARGET_64BIT + + BlockRange().Remove(arg); + } + } + else + { +#if !defined(TARGET_64BIT) + if (op1->OperIsLong()) + { + BlockRange().Remove(op1->AsOp()->gtOp1); + BlockRange().Remove(op1->AsOp()->gtOp2); + } +#endif // !TARGET_64BIT + + BlockRange().Remove(op1); + + if (op2 != nullptr) + { +#if defined(TARGET_64BIT) + if (op2->OperIsLong()) + { + BlockRange().Remove(op2->AsOp()->gtOp1); + BlockRange().Remove(op2->AsOp()->gtOp2); + } +#endif // !TARGET_64BIT + + BlockRange().Remove(op2); + } + } + + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32)); + + if ((argCnt == 1) || + ((vecCns.i64[0] == vecCns.i64[1]) && ((simdSize <= 16) || (vecCns.i64[2] == vecCns.i64[3])))) + { + // If we are a single constant or if all parts are the same, we might be able to optimize + // this even further for certain values, such as Zero or AllBitsSet. + + if (vecCns.i64[0] == 0) + { + node->gtOp1 = nullptr; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_get_Zero; + return; + } + else if (vecCns.i64[0] == -1) + { + node->gtOp1 = nullptr; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_get_AllBitsSet; + return; + } + } + + unsigned cnsSize = (simdSize != 12) ? simdSize : 16; + unsigned cnsAlign = + (comp->compCodeOpt() != Compiler::SMALL_CODE) ? cnsSize : emitter::dataSection::MIN_DATA_ALIGN; + var_types dataType = Compiler::getSIMDTypeForSize(simdSize); + + UNATIVE_OFFSET cnum = comp->GetEmitter()->emitDataConst(&vecCns, cnsSize, cnsAlign, dataType); + CORINFO_FIELD_HANDLE hnd = comp->eeFindJitDataOffs(cnum); + GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr); + BlockRange().InsertBefore(node, clsVarAddr); + + node->ChangeOper(GT_IND); + node->gtOp1 = clsVarAddr; + + // TODO-XARCH-CQ: We should be able to modify at least the paths that use Insert to trivially support partial + // vector constants. With this, we can create a constant if say 50% of the inputs are also constant and just + // insert the non-constant values which should still allow some gains. 
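+        // For example (roughly): Vector128.Create(1, 2, 3, 4) with all-constant arguments is
+        // emitted as a 16-byte constant in the data section and this node becomes a GT_IND of
+        // its CLS_VAR_ADDR, so no per-element insert instructions are generated.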
+ + return; + } + else if (argCnt == 1) + { + // We have the following (where simd is simd16 or simd32): + // /--* op1 T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector256_Create) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd32 T BroadcastScalarToVector256 + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // return Avx2.BroadcastScalarToVector256(tmp1); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector256; + return; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T Create + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T ToVector256Unsafe + // idx = CNS_INT int 0 + // /--* tmp3 simd32 + // +--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd32 T InsertVector128 + + // This is roughly the following managed code: + // var tmp1 = Vector128.Create(op1); + // var tmp2 = tmp1; + // var tmp3 = tmp2.ToVector256Unsafe(); + // return Avx.InsertVector128(tmp3, tmp1, 0x01); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, tmp2, NI_Vector128_ToVector256Unsafe, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp3, idx); + + node->gtOp1 = comp->gtNewArgList(tmp3, tmp1, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX_InsertVector128; + return; + } + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // ... + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + if ((baseType != TYP_DOUBLE) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 T BroadcastScalarToVector128 + + // This is roughly the following managed code: + // ... + // return Avx2.BroadcastScalarToVector128(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector128; + return; + } + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) + { + // We will be constructing the following parts: + // ... 
+ // tmp2 = HWINTRINSIC simd16 ubyte get_Zero + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ubyte Shuffle + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.Zero; + // return Ssse3.Shuffle(tmp1, tmp2); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, NI_Vector128_get_Zero, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSSE3_Shuffle; + break; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 ubyte UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp1 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + FALLTHROUGH; + } + + case TYP_SHORT: + case TYP_USHORT: + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 ushort UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp1 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + FALLTHROUGH; + } + + case TYP_INT: + case TYP_UINT: + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 uint Shuffle + + // This is roughly the following managed code: + // ... + // return Sse2.Shuffle(tmp1, 0x00); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); + + node->gtOp1 = tmp1; + node->gtOp2 = idx; + + node->gtHWIntrinsicId = NI_SSE2_Shuffle; + node->gtSIMDBaseType = TYP_UINT; + + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ulong UnpackLow + + // This is roughly the following managed code: + // ... 
+ // var tmp2 = tmp1; + // return Sse2.UnpackLow(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 float Permute + + // This is roughly the following managed code: + // ... + // return Avx.Permute(tmp1, 0x00); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); + + node->gtOp1 = tmp1; + node->gtOp2 = idx; + + node->gtHWIntrinsicId = NI_AVX_Permute; + break; + } + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 float Shuffle + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse.Shuffle(tmp1, tmp2, 0x00); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE_Shuffle; + break; + } + + case TYP_DOUBLE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 double MoveAndDuplicate + + // This is roughly the following managed code: + // ... + // return Sse3.MoveAndDuplicate(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE3_MoveAndDuplicate; + break; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 float MoveLowToHigh + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse.MoveLowToHigh(tmp1, tmp2); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + node->gtSIMDBaseType = TYP_FLOAT; + + break; + } + + default: + { + unreached(); + } + } + + return; + } + + // We have the following (where simd is simd16 or simd32): + // /--* op1 T + // +--* ... 
T + // +--* opN T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector256_Create) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // /--* op1 T + // +--* ... T + // lo = * HWINTRINSIC simd16 T Create + // /--* ... T + // +--* opN T + // hi = * HWINTRINSIC simd16 T Create + // idx = CNS_INT int 1 + // /--* lo simd32 + // +--* hi simd16 + // +--* idx int + // node = * HWINTRINSIC simd32 T InsertVector128 + + // This is roughly the following managed code: + // ... + // var lo = Vector128.Create(op1, ...); + // var hi = Vector128.Create(..., opN); + // return Avx.InsertVector128(lo, hi, 0x01); + + // Each Vector128.Create call gets half the operands. That is: + // lo = Vector128.Create(op1, op2); + // hi = Vector128.Create(op3, op4); + // -or- + // lo = Vector128.Create(op1, ..., op3); + // hi = Vector128.Create(op4, ..., op7); + // -or- + // lo = Vector128.Create(op1, ..., op7); + // hi = Vector128.Create(op8, ..., op15); + // -or- + // lo = Vector128.Create(op1, ..., op15); + // hi = Vector128.Create(op16, ..., op31); + + unsigned halfArgCnt = argCnt / 2; + assert((halfArgCnt * 2) == argCnt); + + argList = op1->AsArgList(); + + for (unsigned i = 0; i < halfArgCnt; i++) + { + op2 = argList; + argList = argList->Rest(); + } + + op2->AsArgList()->gtOp2 = nullptr; + op2 = argList; + + // The above for loop splits the operand count into exactly half. + // Once it exits, op1 will point to op1 and op2 will point to the + // last operand that will be passed to the first Vector128.Create + // We will set its op2 to null, terminating the chain and then + // assign op2 to be argList, which is the first operand that will + // get passed to the second Vector128.Create + + GenTree* lo = nullptr; + GenTree* hi = nullptr; + + if (halfArgCnt == 2) + { + // The Vector256.Create calls that take 4 operands are special + // because the half argument count is 2, which means we can't + // actually use the GT_LIST anymore and need to pass them as + // explicit operands instead. 
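+            // For example (illustrative), for Vector256.Create(a, b, c, d) this path builds:
+            //   lo = Vector128.Create(a, b);
+            //   hi = Vector128.Create(c, d);
+            //   return Avx.InsertVector128(lo, hi, 0x01);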
+ + argList = op1->AsArgList(); + + tmp1 = argList->Current(); + tmp2 = argList->Rest()->Current(); + + lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(tmp2, lo); + LowerNode(lo); + + argList = op2->AsArgList(); + + tmp1 = argList->Current(); + tmp2 = argList->Rest()->Current(); + + hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(tmp2, hi); + LowerNode(hi); + } + else + { + // The rest of the Vector256.Create calls take at least 8 operands + // and so the half count is at least 4 and we have to continue + // passing around GT_LIST nodes in op1 with a null op2 + assert(halfArgCnt >= 4); + + tmp1 = op2->AsArgList()->Current(); + + lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); + BlockRange().InsertBefore(tmp1, lo); + LowerNode(lo); + + hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertBefore(node, hi); + LowerNode(hi); + } + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(hi, idx); + + node->gtOp1 = comp->gtNewArgList(lo, hi, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX_InsertVector128; + return; + } + + if (op1->OperIsList()) + { + argList = op1->AsArgList(); + op1 = argList->Current(); + argList = argList->Rest(); + } + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // ... + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + unsigned N = 0; + GenTree* opN = nullptr; + NamedIntrinsic insIntrinsic = NI_Illegal; + + if ((baseType == TYP_SHORT) || (baseType == TYP_USHORT)) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + insIntrinsic = NI_SSE2_Insert; + } + else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + insIntrinsic = NI_SSE41_Insert; + } + + if (insIntrinsic != NI_Illegal) + { + for (N = 1; N < argCnt - 1; N++) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Sse?.Insert(tmp1, opN, N); + // ... + + opN = argList->Current(); + + idx = comp->gtNewIconNode(N, TYP_INT); + BlockRange().InsertAfter(opN, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, opN, idx, insIntrinsic, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + argList = argList->Rest(); + } + + assert(N == (argCnt - 1)); + + // We will be constructing the following parts: + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... + // tmp1 = Sse?.Insert(tmp1, opN, N); + // ... 
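+                    // For example (illustrative), Vector128.Create(a, b, c, d) with int elements
+                    // and SSE4.1 available becomes:
+                    //   tmp1 = Vector128.CreateScalarUnsafe(a);
+                    //   tmp1 = Sse41.Insert(tmp1, b, 1);
+                    //   tmp1 = Sse41.Insert(tmp1, c, 2);
+                    //   return Sse41.Insert(tmp1, d, 3);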
+ + opN = argList->Current(); + + idx = comp->gtNewIconNode(N, TYP_INT); + BlockRange().InsertAfter(opN, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, opN, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = insIntrinsic; + break; + } + + assert((baseType != TYP_SHORT) && (baseType != TYP_USHORT)); + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + GenTree* op[16]; + op[0] = tmp1; + + for (N = 1; N < argCnt; N++) + { + opN = argList->Current(); + + op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, op[N]); + LowerNode(op[N]); + + argList = argList->Rest(); + } + assert(argList == nullptr); + + if ((baseType == TYP_BYTE) || (baseType == TYP_UBYTE)) + { + for (N = 0; N < argCnt; N += 4) + { + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Sse2.UnpackLow(opN, opO); + // tmp2 = Sse2.UnpackLow(opP, opQ); + // tmp3 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + unsigned O = N + 1; + unsigned P = N + 2; + unsigned Q = N + 3; + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, op[N], op[O], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(op[O], tmp1); + LowerNode(tmp1); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(simdType, op[P], op[Q], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(op[Q], tmp2); + LowerNode(tmp2); + + tmp3 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + // This caches the result in index 0 through 3, depending on which + // loop iteration this is and allows the rest of the logic to be + // shared with the TYP_INT and TYP_UINT path. + + op[N / 4] = tmp3; + } + } + + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T UnpackLow + + // This is roughly the following managed code: + // ... 
+ // tmp1 = Sse2.UnpackLow(opN, opO); + // tmp2 = Sse2.UnpackLow(opP, opQ); + // return Sse2.UnpackLow(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE2_UnpackLow, TYP_UINT, simdSize); + BlockRange().InsertAfter(op[1], tmp1); + LowerNode(tmp1); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE2_UnpackLow, TYP_UINT, simdSize); + BlockRange().InsertAfter(op[3], tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + node->gtSIMDBaseType = TYP_ULONG; + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 1 + // /--* tmp1 simd16 + // +--* op2 T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... + // return Sse41.X64.Insert(tmp1, op2, 0x01); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertBefore(node, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, op2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE41_X64_Insert; + break; + } + + // We will be constructing the following parts: + // ... + // /--* op2 T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T UnpackLow + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.CreateScalarUnsafe(op2); + // return Sse2.UnpackLow(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op2, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + { + unsigned N = 0; + GenTree* opN = nullptr; + + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + for (N = 1; N < argCnt - 1; N++) + { + // We will be constructing the following parts: + // ... + // + // /--* opN T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Vector128.CreateScalarUnsafe(opN); + // tmp1 = Sse41.Insert(tmp1, tmp2, N << 4); + // ... + + opN = argList->Current(); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode(N << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, idx, NI_SSE41_Insert, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + argList = argList->Rest(); + } + + // We will be constructing the following parts: + // ... + // + // /--* opN T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... 
+ // tmp2 = Vector128.CreateScalarUnsafe(opN); + // return Sse41.Insert(tmp1, tmp2, N << 4); + + opN = argList->Current(); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode((argCnt - 1) << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE41_Insert; + break; + } + + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T MoveLowToHigh + + // This is roughly the following managed code: + // ... + // tmp1 = Sse.UnpackLow(opN, opO); + // tmp2 = Sse.UnpackLow(opP, opQ); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); + + GenTree* op[4]; + op[0] = tmp1; + + for (N = 1; N < argCnt; N++) + { + opN = argList->Current(); + + op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, op[N]); + LowerNode(op[N]); + + argList = argList->Rest(); + } + assert(argList == nullptr); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE_UnpackLow, baseType, simdSize); + BlockRange().InsertAfter(op[1], tmp1); + LowerNode(tmp1); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE_UnpackLow, baseType, simdSize); + BlockRange().InsertAfter(op[3], tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + break; + } + + case TYP_DOUBLE: + { + // We will be constructing the following parts: + // ... + // /--* op2 T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T MoveLowToHigh + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.CreateScalarUnsafe(op2); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op2, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + node->gtSIMDBaseType = TYP_FLOAT; + + break; + } + + default: + { + unreached(); + } + } +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicDot: Lowers a Vector128 or Vector256 Dot call +// +// Arguments: +// node - The hardware intrinsic node. 
+// +void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + ; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + unsigned simd16Count = comp->getSIMDVectorLength(16, baseType); + + assert((intrinsicId == NI_Vector128_Dot) || (intrinsicId == NI_Vector256_Dot)); + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + assert(op1 != nullptr); + assert(op2 != nullptr); + assert(!op1->OperIsList()); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + GenTree* tmp3 = nullptr; + + NamedIntrinsic multiply = NI_Illegal; + NamedIntrinsic horizontalAdd = NI_Illegal; + NamedIntrinsic add = NI_Illegal; + NamedIntrinsic shuffle = NI_Illegal; + + if (simdSize == 32) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + switch (baseType) + { + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + multiply = NI_AVX2_MultiplyLow; + horizontalAdd = NI_AVX2_HorizontalAdd; + add = NI_AVX2_Add; + break; + } + + case TYP_FLOAT: + { + // We will be constructing the following parts: + // idx = CNS_INT int 0xF1 + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0x01 + // /--* tmp2 simd16 + // +--* idx int + // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T Add + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // var tmp1 = Avx.DotProduct(op1, op2, 0xFF); + // var tmp2 = Avx.ExtractVector128(tmp1, 0x01); + // var tmp3 = Sse.Add(tmp1, tmp2); + // return tmp3.ToScalar(); + + idx = comp->gtNewIconNode(0xF1, TYP_INT); + BlockRange().InsertBefore(node, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_AVX_DotProduct, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_SSE_Add, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + node->gtSIMDSize = 16; + + node->gtOp1 = tmp3; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; + } + + case TYP_DOUBLE: + { + multiply = NI_AVX_Multiply; + horizontalAdd = NI_AVX_HorizontalAdd; + add = NI_AVX_Add; + break; + } + + default: + { + unreached(); + } + } + } + else + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + switch (baseType) + { + case TYP_SHORT: + case TYP_USHORT: + { + multiply = NI_SSE2_MultiplyLow; + horizontalAdd = 
NI_SSSE3_HorizontalAdd; + add = NI_SSE2_Add; + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) + { + shuffle = NI_SSE2_ShuffleLow; + } + break; + } + + case TYP_INT: + case TYP_UINT: + { + multiply = NI_SSE41_MultiplyLow; + horizontalAdd = NI_SSSE3_HorizontalAdd; + add = NI_SSE2_Add; + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41)); + break; + } + + case TYP_FLOAT: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // We will be constructing the following parts: + // idx = CNS_INT int 0xFF + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp3 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // var tmp3 = Avx.DotProduct(op1, op2, 0xFF); + // return tmp3.ToScalar(); + + if (simdSize == 8) + { + idx = comp->gtNewIconNode(0x31, TYP_INT); + } + else if (simdSize == 12) + { + idx = comp->gtNewIconNode(0x71, TYP_INT); + } + else + { + assert(simdSize == 16); + idx = comp->gtNewIconNode(0xF1, TYP_INT); + } + BlockRange().InsertBefore(node, idx); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType, + simdSize); + BlockRange().InsertAfter(idx, tmp3); + LowerNode(tmp3); + + node->gtOp1 = tmp3; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; + } + + multiply = NI_SSE_Multiply; + horizontalAdd = NI_SSE3_HorizontalAdd; + add = NI_SSE_Add; + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + shuffle = NI_SSE_Shuffle; + } + break; + } + + case TYP_DOUBLE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // We will be constructing the following parts: + // idx = CNS_INT int 0x31 + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp3 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // var tmp3 = Avx.DotProduct(op1, op2, 0x31); + // return tmp3.ToScalar(); + + idx = comp->gtNewIconNode(0x31, TYP_INT); + BlockRange().InsertBefore(node, idx); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType, + simdSize); + BlockRange().InsertAfter(idx, tmp3); + LowerNode(tmp3); + + node->gtOp1 = tmp3; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; + } + + multiply = NI_SSE2_Multiply; + horizontalAdd = NI_SSE3_HorizontalAdd; + add = NI_SSE2_Add; + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + shuffle = NI_SSE2_Shuffle; + } + break; + } + + default: + { + unreached(); + } + } + + if (simdSize == 8) + { + assert(baseType == TYP_FLOAT); + + // If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength, + // which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will + // be emitted rather than 2, so that the upper two elements will be ignored. + + simd16Count = 2; + } + else if (simdSize == 12) + { + assert(baseType == TYP_FLOAT); + + // We will be constructing the following parts: + // ... + // +--* CNS_INT int -1 + // +--* CNS_INT int -1 + // +--* CNS_INT int -1 + // +--* CNS_INT int 0 + // tmp1 = * HWINTRINSIC simd16 T Create + // /--* op2 simd16 + // +--* tmp1 simd16 + // op1 = * HWINTRINSIC simd16 T And + // ... + + // This is roughly the following managed code: + // ... 
+ // tmp1 = Vector128.Create(-1, -1, -1, 0); + // op1 = Sse.And(op1, tmp2); + // ... + + GenTree* cns0 = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(op1, cns0); + + GenTree* cns1 = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(cns0, cns1); + + GenTree* cns2 = comp->gtNewIconNode(-1, TYP_INT); + BlockRange().InsertAfter(cns1, cns2); + + GenTree* cns3 = comp->gtNewIconNode(0, TYP_INT); + BlockRange().InsertAfter(cns2, cns3); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, cns0, cns1, cns2, cns3, NI_Vector128_Create, TYP_INT, 16); + BlockRange().InsertAfter(cns3, tmp1); + LowerNode(tmp1); + + op1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, tmp1, NI_SSE_And, baseType, simdSize); + BlockRange().InsertAfter(tmp1, op1); + LowerNode(op1); + } + } + + // We will be constructing the following parts: + // /--* op1 simd16 + // +--* op2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Multiply + // ... + + // This is roughly the following managed code: + // var tmp1 = Isa.Multiply(op1, op2); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, multiply, baseType, simdSize); + BlockRange().InsertBefore(node, tmp1); + LowerNode(tmp1); + + // HorizontalAdd combines pairs so we need log2(simd16Count) passes to sum all elements together. + int haddCount = genLog2(simd16Count); + + for (int i = 0; i < haddCount; i++) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = tmp1; + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + if (shuffle == NI_Illegal) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 T HorizontalAdd + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Isa.HorizontalAdd(tmp1, tmp2); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, horizontalAdd, baseType, simdSize); + } + else + { + int shuffleConst = 0x00; + + switch (i) + { + case 0: + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || varTypeIsFloating(baseType)); + + // Adds (e0 + e1, e1 + e0, e2 + e3, e3 + e2), giving: + // e0, e1, e2, e3 | e4, e5, e6, e7 + // e1, e0, e3, e2 | e5, e4, e7, e6 + // ... + + shuffleConst = 0xB1; + break; + } + + case 1: + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || (baseType == TYP_FLOAT)); + + // Adds (e0 + e2, e1 + e3, e2 + e0, e3 + e1), giving: + // ... + // e2, e3, e0, e1 | e6, e7, e4, e5 + // e3, e2, e1, e0 | e7, e6, e5, e4 + + shuffleConst = 0x4E; + break; + } + + case 2: + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT)); + + // Adds (e0 + e4, e1 + e5, e2 + e6, e3 + e7), giving: + // ... + // e4, e5, e6, e7 | e0, e1, e2, e3 + // e5, e4, e7, e6 | e1, e0, e3, e2 + // e6, e7, e4, e5 | e2, e3, e0, e1 + // e7, e6, e5, e4 | e3, e2, e1, e0 + + shuffleConst = 0x4D; + break; + } + + default: + { + unreached(); + } + } + + idx = comp->gtNewIconNode(shuffleConst, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + if (varTypeIsFloating(baseType)) + { + // We will be constructing the following parts: + // ... 
+ // /--* tmp2 simd16 + // * STORE_LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // tmp3 = LCL_VAR simd16 + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* tmp3 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T Shuffle + // ... + + // This is roughly the following managed code: + // ... + // tmp3 = tmp2; + // tmp2 = Isa.Shuffle(tmp2, tmp3, shuffleConst); + // ... + + node->gtOp1 = tmp2; + LIR::Use tmp2Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp2Use); + tmp2 = node->gtOp1; + + tmp3 = comp->gtClone(tmp2); + BlockRange().InsertAfter(tmp2, tmp3); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, shuffle, baseType, simdSize); + } + else + { + assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT)); + + if (i < 2) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T ShuffleLow + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T ShuffleHigh + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Isa.Shuffle(tmp1, shuffleConst); + // ... + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleLow, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode(shuffleConst, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleHigh, baseType, simdSize); + } + else + { + assert(i == 2); + + // We will be constructing the following parts: + // ... + // idx = CNS_INT int shuffleConst + // /--* tmp2 simd16 + // +--* idx simd16 + // tmp2 = * HWINTRINSIC simd16 T ShuffleLow + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Isa.Shuffle(tmp1, shuffleConst); + // ... + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_Shuffle, TYP_INT, simdSize); + } + } + + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Add + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Isa.Add(tmp1, tmp2); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, add, baseType, simdSize); + } + + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + } + + if (simdSize == 32) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0x01 + // /--* tmp2 simd16 + // +--* idx int + // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 T Add + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp2 = Avx.ExtractVector128(tmp2, 0x01); + // var tmp1 = Isa.Add(tmp1, tmp2); + // ... 
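+        // That is, the upper 128 bits of the 256-bit partial result are folded into the lower
+        // 128 bits so that the final ToScalar below reads the fully reduced value.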
+ + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp2); + LowerNode(tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, add, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + node->gtSIMDSize = 16; + } + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 T ToScalar + + // This is roughly the following managed code: + // ... + // return tmp1.ToScalar(); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_Vector128_ToScalar; + LowerNode(node); + + return; +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + ; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar)); + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + switch (baseType) + { + case TYP_BYTE: + case TYP_SHORT: + case TYP_INT: + { + node->gtType = TYP_INT; + node->gtSIMDBaseType = TYP_INT; + node->gtHWIntrinsicId = NI_SSE2_ConvertToInt32; + break; + } + + case TYP_UBYTE: + case TYP_USHORT: + case TYP_UINT: + { + node->gtType = TYP_UINT; + node->gtSIMDBaseType = TYP_UINT; + node->gtHWIntrinsicId = NI_SSE2_ConvertToUInt32; + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + { + node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToInt64; + break; + } + + case TYP_ULONG: + { + node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToUInt64; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + case TYP_DOUBLE: + { + ContainCheckHWIntrinsic(node); + return; + } + + default: + { + unreached(); + } + } + + LowerNode(node); + + if (genTypeSize(baseType) < 4) + { + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + + GenTreeCast* cast = comp->gtNewCastNode(baseType, node, node->IsUnsigned(), baseType); + BlockRange().InsertAfter(node, cast); + + if (foundUse) + { + use.ReplaceWith(comp, cast); + } + LowerNode(cast); + } +} +#endif // FEATURE_HW_INTRINSICS + +//---------------------------------------------------------------------------------------------- +// Lowering::IsRMWIndirCandidate: +// Returns true if the given operand is a candidate indirection for a read-modify-write +// operator. +// +// Arguments: +// operand - The operand to consider. +// storeInd - The indirect store that roots the possible RMW operator. +// +bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd) +{ + // If the operand isn't an indirection, it's trivially not a candidate. 
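+    // (An RMW candidate here is the gtInd(addr) feeding a pattern such as
+    // storeInd(addr, binOp(gtInd(addr), value)); see IsRMWMemOpRootedAtStoreInd below.)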
+    if (operand->OperGet() != GT_IND)
+    {
+        return false;
+    }
+
+    // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the
+    // indirection is not a candidate.
+    GenTree* srcAddr = operand->gtGetOp1();
+    GenTree* dstAddr = storeInd->gtGetOp1();
+    if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd))
+    {
+        return false;
+    }
+
+    // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a
+    // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the
+    // indirection's tree is visited and check the side effects at each point.
+
+    m_scratchSideEffects.Clear();
+
+    assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0);
+    operand->gtLIRFlags |= LIR::Flags::Mark;
+
+    unsigned markCount = 1;
+    GenTree* node;
+    for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev)
+    {
+        assert(node != nullptr);
+
+        if ((node->gtLIRFlags & LIR::Flags::Mark) == 0)
+        {
+            m_scratchSideEffects.AddNode(comp, node);
+        }
+        else
+        {
+            node->gtLIRFlags &= ~LIR::Flags::Mark;
+            markCount--;
+
+            if (m_scratchSideEffects.InterferesWith(comp, node, false))
+            {
+                // The indirection's tree contains some node that can't be moved to the storeIndir. The indirection is
+                // not a candidate. Clear any leftover mark bits and return.
+                for (; markCount > 0; node = node->gtPrev)
+                {
+                    if ((node->gtLIRFlags & LIR::Flags::Mark) != 0)
+                    {
+                        node->gtLIRFlags &= ~LIR::Flags::Mark;
+                        markCount--;
+                    }
+                }
+                return false;
+            }
+
+            node->VisitOperands([&markCount](GenTree* nodeOperand) -> GenTree::VisitResult {
+                assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0);
+                nodeOperand->gtLIRFlags |= LIR::Flags::Mark;
+                markCount++;
+                return GenTree::VisitResult::Continue;
+            });
+        }
+    }
+
+    // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's
+    // destination address, and that it and the transitive closure of its operand can be safely contained by the
+    // storeIndir. This indirection is therefore a candidate for an RMW op.
+    return true;
+}
+
+//----------------------------------------------------------------------------------------------
+// Returns true if this tree is the bin-op of a GT_STOREIND of the following form
+//     storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or
+//     storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA))) in case of commutative bin-ops
+//
+// The above form for storeInd represents a read-modify-write memory binary operation.
+//
+// Parameters
+//     tree - GenTreePtr of binOp
+//
+// Return Value
+//     True if 'tree' is part of an RMW memory operation pattern
+//
+bool Lowering::IsBinOpInRMWStoreInd(GenTree* tree)
+{
+    // Must be a non-floating-point type binary operator since SSE2 doesn't support RMW memory ops
+    assert(!varTypeIsFloating(tree));
+    assert(GenTree::OperIsBinary(tree->OperGet()));
+
+    // Cheap bail out check before more expensive checks are performed.
+    // RMW memory op pattern requires that one of the operands of the binOp be a GT_IND.
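+    // For example (illustrative): an ADD whose operands are gtInd(addr) and a constant, and whose
+    // user is storeInd(addr, ...), fits the pattern and can later be encoded as `add [addr], imm`.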
+    if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND)
+    {
+        return false;
+    }
+
+    LIR::Use use;
+    if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree)
+    {
+        return false;
+    }
+
+    // Since it is relatively expensive to recognize the RMW memory op pattern, we
+    // cache the result in the GT_STOREIND node so that while lowering GT_STOREIND
+    // we can use the result.
+    GenTree* indirCandidate = nullptr;
+    GenTree* indirOpSource = nullptr;
+    return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource);
+}
+
+//----------------------------------------------------------------------------------------------
+// This method recognizes the case where we have a treeNode with the following structure:
+//         storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR
+//         storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst))) in case of commutative operations OR
+//         storeInd(IndirDst, unaryOp(gtInd(IndirDst))) in case of unary operations
+//
+// Terminology:
+//         indirDst = memory write of an addr mode (i.e. storeind destination)
+//         indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op)
+//         indirCandidate = memory read i.e. a gtInd of an addr mode
+//         indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node)
+//
+// In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the
+// following form in case of integer operations:
+//         binOp [addressing mode], RegIndirOpSource
+//         binOp [addressing mode], immediateVal
+// where RegIndirOpSource is the register where indirOpSource was computed.
+//
+// Right now, we recognize a few cases:
+//     a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant
+//     b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz.
+//     c) unaryOp is either not/neg
+//
+// Implementation Note: The following routines need to be in sync for RMW memory op optimization
+// to be correct and functional.
+//     IndirsAreEquivalent()
+//     NodesAreEquivalentLeaves()
+//     Codegen of GT_STOREIND and genCodeForShiftRMW()
+//     emitInsRMW()
+//
+// TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering
+// package to perform more complex tree recognition.
+//
+// TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source)
+//
+// Parameters:
+//     tree - GT_STOREIND node
+//     outIndirCandidate - out param set to indirCandidate as described above
+//     outIndirOpSource - out param set to indirOpSource as described above
+//
+// Return value
+//     True if there is an RMW memory operation rooted at a GT_STOREIND tree
+//     and out params indirCandidate and indirOpSource are set to non-null values.
+//     Otherwise, returns false with indirCandidate and indirOpSource set to null.
+//     Also updates flags of GT_STOREIND tree with its RMW status.
+// +bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTree* tree, GenTree** outIndirCandidate, GenTree** outIndirOpSource) +{ + assert(false); + return true; +} + +// anything is in range for AMD64 +bool Lowering::IsCallTargetInRange(void* addr) +{ + return true; +} + +// return true if the immediate can be folded into an instruction, for example small enough and non-relocatable +bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode) const +{ + if (!childNode->IsIntCnsFitsInI32()) + { + return false; + } + + // At this point we know that it is an int const fits within 4-bytes and hence can safely cast to IntConCommon. + // Icons that need relocation should never be marked as contained immed + if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp)) + { + return false; + } + + return true; +} + +//----------------------------------------------------------------------- +// PreferredRegOptionalOperand: returns one of the operands of given +// binary oper that is to be preferred for marking as reg optional. +// +// Since only one of op1 or op2 can be a memory operand on xarch, only +// one of them have to be marked as reg optional. Since Lower doesn't +// know apriori which of op1 or op2 is not likely to get a register, it +// has to make a guess. This routine encapsulates heuristics that +// guess whether it is likely to be beneficial to mark op1 or op2 as +// reg optional. +// +// +// Arguments: +// tree - a binary-op tree node that is either commutative +// or a compare oper. +// +// Returns: +// Returns op1 or op2 of tree node that is preferred for +// marking as reg optional. +// +// Note: if the tree oper is neither commutative nor a compare oper +// then only op2 can be reg optional on xarch and hence no need to +// call this routine. +GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree) +{ + assert(false); + + return NULL; +} + +//------------------------------------------------------------------------ +// Containment analysis +//------------------------------------------------------------------------ + +//------------------------------------------------------------------------ +// ContainCheckCallOperands: Determine whether operands of a call should be contained. +// +// Arguments: +// call - The call node of interest +// +// Return Value: +// None. +// +void Lowering::ContainCheckCallOperands(GenTreeCall* call) +{ + GenTree* ctrlExpr = call->gtControlExpr; + if (call->gtCallType == CT_INDIRECT) + { + // either gtControlExpr != null or gtCallAddr != null. + // Both cannot be non-null at the same time. + assert(ctrlExpr == nullptr); + assert(call->gtCallAddr != nullptr); + ctrlExpr = call->gtCallAddr; + +#ifdef TARGET_X86 + // Fast tail calls aren't currently supported on x86, but if they ever are, the code + // below that handles indirect VSD calls will need to be fixed. + assert(!call->IsFastTailCall() || !call->IsVirtualStub()); +#endif // TARGET_X86 + } + + // set reg requirements on call target represented as control sequence. + if (ctrlExpr != nullptr) + { + // we should never see a gtControlExpr whose type is void. + assert(ctrlExpr->TypeGet() != TYP_VOID); + + // In case of fast tail implemented as jmp, make sure that gtControlExpr is + // computed into a register. + if (!call->IsFastTailCall()) + { +#ifdef TARGET_X86 + // On x86, we need to generate a very specific pattern for indirect VSD calls: + // + // 3-byte nop + // call dword ptr [eax] + // + // Where EAX is also used as an argument to the stub dispatch helper. 
Make + // sure that the call target address is computed into EAX in this case. + if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) + { + assert(ctrlExpr->isIndir()); + MakeSrcContained(call, ctrlExpr); + } + else +#endif // TARGET_X86 + if (ctrlExpr->isIndir()) + { + // We may have cases where we have set a register target on the ctrlExpr, but if it + // contained we must clear it. + ctrlExpr->SetRegNum(REG_NA); + MakeSrcContained(call, ctrlExpr); + } + } + } + + for (GenTreeCall::Use& use : call->Args()) + { + if (use.GetNode()->OperIs(GT_PUTARG_STK)) + { + LowerPutArgStk(use.GetNode()->AsPutArgStk()); + } + } + + for (GenTreeCall::Use& use : call->LateArgs()) + { + if (use.GetNode()->OperIs(GT_PUTARG_STK)) + { + LowerPutArgStk(use.GetNode()->AsPutArgStk()); + } + } +} + +//------------------------------------------------------------------------ +// ContainCheckIndir: Determine whether operands of an indir should be contained. +// +// Arguments: +// node - The indirection node of interest +// +// Notes: +// This is called for both store and load indirections. In the former case, it is assumed that +// LowerStoreIndir() has already been called to check for RMW opportunities. +// +// Return Value: +// None. +// +void Lowering::ContainCheckIndir(GenTreeIndir* node) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// ContainCheckStoreIndir: determine whether the sources of a STOREIND node should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckStoreIndir(GenTreeIndir* node) +{ + // If the source is a containable immediate, make it contained, unless it is + // an int-size or larger store of zero to memory, because we can generate smaller code + // by zeroing a register and then storing it. + GenTree* src = node->AsOp()->gtOp2; + if (IsContainableImmed(node, src) && + (!src->IsIntegralConst(0) || varTypeIsSmall(node) || node->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR)) + { + MakeSrcContained(node, src); + } + ContainCheckIndir(node); +} + +//------------------------------------------------------------------------ +// ContainCheckMul: determine whether the sources of a MUL node should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckMul(GenTreeOp* node) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// ContainCheckDivOrMod: determine which operands of a div/mod should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckDivOrMod(GenTreeOp* node) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// ContainCheckShiftRotate: determine whether the sources of a shift/rotate node should be contained. 
+// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckShiftRotate(GenTreeOp* node) +{ + assert(node->OperIsShiftOrRotate()); +#ifdef TARGET_X86 + GenTree* source = node->gtOp1; + if (node->OperIsShiftLong()) + { + assert(source->OperGet() == GT_LONG); + MakeSrcContained(node, source); + } +#endif // !TARGET_X86 + + GenTree* shiftBy = node->gtOp2; + if (IsContainableImmed(node, shiftBy) && (shiftBy->AsIntConCommon()->IconValue() <= 255) && + (shiftBy->AsIntConCommon()->IconValue() >= 0)) + { + MakeSrcContained(node, shiftBy); + } +} + +//------------------------------------------------------------------------ +// ContainCheckStoreLoc: determine whether the source of a STORE_LCL* should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckStoreLoc(GenTreeLclVarCommon* storeLoc) const +{ + assert(storeLoc->OperIsLocalStore()); + GenTree* op1 = storeLoc->gtGetOp1(); + + if (op1->OperIs(GT_BITCAST)) + { + // If we know that the source of the bitcast will be in a register, then we can make + // the bitcast itself contained. This will allow us to store directly from the other + // type if this node doesn't get a register. + GenTree* bitCastSrc = op1->gtGetOp1(); + if (!bitCastSrc->isContained() && !bitCastSrc->IsRegOptional()) + { + op1->SetContained(); + return; + } + } + + const LclVarDsc* varDsc = comp->lvaGetDesc(storeLoc); + +#ifdef FEATURE_SIMD + if (varTypeIsSIMD(storeLoc)) + { + assert(!op1->IsCnsIntOrI()); + if (storeLoc->TypeIs(TYP_SIMD12) && op1->IsSIMDZero() && varDsc->lvDoNotEnregister) + { + // For a SIMD12 store we can zero from integer registers more easily. + MakeSrcContained(storeLoc, op1); + GenTree* constNode = op1->gtGetOp1(); + assert(constNode->OperIsConst()); + constNode->ClearContained(); + constNode->gtType = TYP_INT; + constNode->SetOper(GT_CNS_INT); + } + return; + } +#endif // FEATURE_SIMD + + // If the source is a containable immediate, make it contained, unless it is + // an int-size or larger store of zero to memory, because we can generate smaller code + // by zeroing a register and then storing it. + var_types type = varDsc->GetRegisterType(storeLoc); + if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(type))) + { + MakeSrcContained(storeLoc, op1); + } +#ifdef TARGET_X86 + else if (op1->OperGet() == GT_LONG) + { + MakeSrcContained(storeLoc, op1); + } +#endif // TARGET_X86 +} + +//------------------------------------------------------------------------ +// ContainCheckCast: determine whether the source of a CAST node should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckCast(GenTreeCast* node) +{ + GenTree* castOp = node->CastOp(); + var_types castToType = node->CastToType(); + var_types srcType = castOp->TypeGet(); + + // force the srcType to unsigned if GT_UNSIGNED flag is set + if (node->gtFlags & GTF_UNSIGNED) + { + srcType = genUnsignedType(srcType); + } + + if (!node->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(srcType))) + { +#ifdef DEBUG + // If converting to float/double, the operand must be 4 or 8 byte in size. + if (varTypeIsFloating(castToType)) + { + unsigned opSize = genTypeSize(srcType); + assert(opSize == 4 || opSize == 8); + } +#endif // DEBUG + + // U8 -> R8 conversion requires that the operand be in a register. 
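+        // (For example, an int -> double cast can fold or mark its memory source reg-optional
+        // below, whereas a ulong source is excluded by the check that follows.)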
+ if (srcType != TYP_ULONG) + { + if (IsContainableMemoryOp(castOp) || castOp->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(node, castOp); + } + else + { + // Mark castOp as reg optional to indicate codegen + // can still generate code if it is on stack. + castOp->SetRegOptional(); + } + } + } +#if !defined(TARGET_64BIT) + if (varTypeIsLong(srcType)) + { + noway_assert(castOp->OperGet() == GT_LONG); + castOp->SetContained(); + } +#endif // !defined(TARGET_64BIT) +} + +//------------------------------------------------------------------------ +// ContainCheckCompare: determine whether the sources of a compare node should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckCompare(GenTreeOp* cmp) +{ + assert(cmp->OperIsCompare() || cmp->OperIs(GT_CMP)); + + GenTree* op1 = cmp->AsOp()->gtOp1; + GenTree* op2 = cmp->AsOp()->gtOp2; + var_types op1Type = op1->TypeGet(); + var_types op2Type = op2->TypeGet(); + + // If either of op1 or op2 is floating point values, then we need to use + // ucomiss or ucomisd to compare, both of which support the following form: + // ucomis[s|d] xmm, xmm/mem + // That is only the second operand can be a memory op. + // + // Second operand is a memory Op: Note that depending on comparison operator, + // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or + // op2 can be a memory op depending on the comparison operator. + if (varTypeIsFloating(op1Type)) + { + // The type of the operands has to be the same and no implicit conversions at this stage. + assert(op1Type == op2Type); + + GenTree* otherOp; + if (GenCondition::FromFloatRelop(cmp).PreferSwap()) + { + otherOp = op1; + } + else + { + otherOp = op2; + } + + assert(otherOp != nullptr); + bool isSafeToContainOtherOp = true; + if (otherOp->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(cmp, otherOp); + } + else if (IsContainableMemoryOp(otherOp)) + { + isSafeToContainOtherOp = IsSafeToContainMem(cmp, otherOp); + if (isSafeToContainOtherOp) + { + MakeSrcContained(cmp, otherOp); + } + } + + if (!otherOp->isContained() && isSafeToContainOtherOp && IsSafeToContainMem(cmp, otherOp)) + { + // SSE2 allows only otherOp to be a memory-op. Since otherOp is not + // contained, we can mark it reg-optional. + // IsSafeToContainMem is expensive so we call it at most once for otherOp. + // If we already called IsSafeToContainMem, it must have returned false; + // otherwise, otherOp would be contained. + otherOp->SetRegOptional(); + } + + return; + } + + // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here + // or in other backend. + + if (CheckImmedAndMakeContained(cmp, op2)) + { + // If the types are the same, or if the constant is of the correct size, + // we can treat the MemoryOp as contained. + if (op1Type == op2Type) + { + if (IsContainableMemoryOp(op1)) + { + MakeSrcContained(cmp, op1); + } + else + { + op1->SetRegOptional(); + } + } + } + else if (op1Type == op2Type) + { + // Note that TEST does not have a r,rm encoding like CMP has but we can still + // contain the second operand because the emitter maps both r,rm and rm,r to + // the same instruction code. This avoids the need to special case TEST here. 
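+        // For example (illustrative): `cmp reg, [mem]` and `cmp [mem], reg` are both encodable,
+        // so whichever of op1/op2 below can safely be a memory operand may be contained or
+        // marked reg-optional.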
+ + bool isSafeToContainOp1 = true; + bool isSafeToContainOp2 = true; + + if (IsContainableMemoryOp(op2)) + { + isSafeToContainOp2 = IsSafeToContainMem(cmp, op2); + if (isSafeToContainOp2) + { + MakeSrcContained(cmp, op2); + } + } + + if (!op2->isContained() && IsContainableMemoryOp(op1)) + { + isSafeToContainOp1 = IsSafeToContainMem(cmp, op1); + if (isSafeToContainOp1) + { + MakeSrcContained(cmp, op1); + } + } + + if (!op1->isContained() && !op2->isContained()) + { + // One of op1 or op2 could be marked as reg optional + // to indicate that codegen can still generate code + // if one of them is on stack. + GenTree* regOptionalCandidate = op1->IsCnsIntOrI() ? op2 : PreferredRegOptionalOperand(cmp); + + // IsSafeToContainMem is expensive so we call it at most once for each operand + // in this method. If we already called IsSafeToContainMem, it must have returned false; + // otherwise, the corresponding operand (op1 or op2) would be contained. + bool setRegOptional = (regOptionalCandidate == op1) ? isSafeToContainOp1 && IsSafeToContainMem(cmp, op1) + : isSafeToContainOp2 && IsSafeToContainMem(cmp, op2); + if (setRegOptional) + { + regOptionalCandidate->SetRegOptional(); + } + } + } +} + +//------------------------------------------------------------------------ +// LowerRMWMemOp: Determine if this is a valid RMW mem op, and if so lower it accordingly +// +// Arguments: +// node - The indirect store node (GT_STORE_IND) of interest +// +// Return Value: +// Returns true if 'node' is a valid RMW mem op; false otherwise. +// +bool Lowering::LowerRMWMemOp(GenTreeIndir* storeInd) +{ + assert(false); + return false; +} + +//------------------------------------------------------------------------ +// ContainCheckBinary: Determine whether a binary op's operands should be contained. +// +// Arguments: +// node - the node we care about +// +void Lowering::ContainCheckBinary(GenTreeOp* node) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// ContainCheckBoundsChk: determine whether any source of a bounds check node should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node) +{ + assert(node->OperIsBoundsCheck()); + GenTree* other; + if (CheckImmedAndMakeContained(node, node->gtIndex)) + { + other = node->gtArrLen; + } + else if (CheckImmedAndMakeContained(node, node->gtArrLen)) + { + other = node->gtIndex; + } + else if (IsContainableMemoryOp(node->gtIndex)) + { + other = node->gtIndex; + } + else + { + other = node->gtArrLen; + } + + if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet()) + { + if (IsContainableMemoryOp(other)) + { + MakeSrcContained(node, other); + } + else + { + // We can mark 'other' as reg optional, since it is not contained. + other->SetRegOptional(); + } + } +} + +//------------------------------------------------------------------------ +// ContainCheckIntrinsic: determine whether the source of an INTRINSIC node should be contained. 
+// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckIntrinsic(GenTreeOp* node) +{ + assert(node->OperIs(GT_INTRINSIC)); + + NamedIntrinsic intrinsicName = node->AsIntrinsic()->gtIntrinsicName; + + if (intrinsicName == NI_System_Math_Sqrt || intrinsicName == NI_System_Math_Round || + intrinsicName == NI_System_Math_Ceiling || intrinsicName == NI_System_Math_Floor) + { + GenTree* op1 = node->gtGetOp1(); + if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(node, op1); + } + else + { + // Mark the operand as reg optional since codegen can still + // generate code if op1 is on stack. + op1->SetRegOptional(); + } + } +} + +#ifdef FEATURE_SIMD +//---------------------------------------------------------------------------------------------- +// ContainCheckSIMD: Perform containment analysis for a SIMD intrinsic node. +// +// Arguments: +// simdNode - The SIMD intrinsic node. +// +void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) +{ + switch (simdNode->gtSIMDIntrinsicID) + { + GenTree* op1; + GenTree* op2; + + case SIMDIntrinsicInit: + { + op1 = simdNode->AsOp()->gtOp1; +#ifndef TARGET_64BIT + if (op1->OperGet() == GT_LONG) + { + MakeSrcContained(simdNode, op1); + GenTree* op1lo = op1->gtGetOp1(); + GenTree* op1hi = op1->gtGetOp2(); + + if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || + (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))) + { + MakeSrcContained(op1, op1lo); + MakeSrcContained(op1, op1hi); + } + } + else +#endif // !TARGET_64BIT + if (op1->IsFPZero() || op1->IsIntegralConst(0) || + (varTypeIsIntegral(simdNode->gtSIMDBaseType) && op1->IsIntegralConst(-1))) + { + MakeSrcContained(simdNode, op1); + } + else if ((comp->getSIMDSupportLevel() == SIMD_AVX2_Supported) && + ((simdNode->gtSIMDSize == 16) || (simdNode->gtSIMDSize == 32))) + { + // Either op1 is a float or dbl constant or an addr + if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) + { + MakeSrcContained(simdNode, op1); + } + } + } + break; + + case SIMDIntrinsicInitArray: + // We have an array and an index, which may be contained. + CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2()); + break; + + case SIMDIntrinsicGetItem: + { + // This implements get_Item method. The sources are: + // - the source SIMD struct + // - index (which element to get) + // The result is baseType of SIMD struct. + op1 = simdNode->AsOp()->gtOp1; + op2 = simdNode->AsOp()->gtOp2; + + if (op1->OperGet() == GT_IND) + { + assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0); + op1->AsIndir()->Addr()->ClearContained(); + } + // If the index is a constant, mark it as contained. + CheckImmedAndMakeContained(simdNode, op2); + + if (IsContainableMemoryOp(op1)) + { + MakeSrcContained(simdNode, op1); + if (op1->OperGet() == GT_IND) + { + op1->AsIndir()->Addr()->ClearContained(); + } + } + } + break; + + case SIMDIntrinsicShuffleSSE2: + // Second operand is an integer constant and marked as contained. + assert(simdNode->AsOp()->gtOp2->IsCnsIntOrI()); + MakeSrcContained(simdNode, simdNode->AsOp()->gtOp2); + break; + + default: + break; + } +} +#endif // FEATURE_SIMD + +#ifdef FEATURE_HW_INTRINSICS +//---------------------------------------------------------------------------------------------- +// IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op. 
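+// (That is, whether 'node' can be folded into 'containingNode' as a memory operand.)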
+// +// Arguments: +// containingNode - The hardware intrinsic node which contains 'node' +// node - The node to check +// [Out] supportsRegOptional - On return, this will be true if 'containingNode' supports regOptional operands; +// otherwise, false. +// +// Return Value: +// true if 'node' is a containable hardware intrinsic node; otherwise, false. +// +bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node, bool* supportsRegOptional) +{ + NamedIntrinsic containingIntrinsicId = containingNode->gtHWIntrinsicId; + HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(containingIntrinsicId); + + // We shouldn't have called in here if containingNode doesn't support containment + assert(HWIntrinsicInfo::SupportsContainment(containingIntrinsicId)); + + // containingNode supports nodes that read from an aligned memory address + // + // This will generally be an explicit LoadAligned instruction and is false for + // machines with VEX support when minOpts is enabled. This is because there is + // currently no way to guarantee that the address read from will always be + // aligned and we want to assert that the address is aligned when optimizations + // aren't enabled. However, when optimizations are enabled, we want to allow + // folding of memory operands as it produces better codegen and allows simpler + // coding patterns on the managed side. + bool supportsAlignedSIMDLoads = false; + + // containingNode supports nodes that read from general memory + // + // We currently have to assume all "general" loads are unaligned. As such, this is + // generally used to determine if we can mark the node as `regOptional` in the case + // where `node` is not containable. However, this can also be used to determine whether + // we can mark other types of reads as contained (such as when directly reading a local). + bool supportsGeneralLoads = false; + + // containingNode supports nodes that read from a scalar memory address + // + // This will generally be an explicit LoadScalar instruction but is also used to determine + // whether we can read an address of type T (we don't support this when the load would + // read more than sizeof(T) bytes). + bool supportsSIMDScalarLoads = false; + + // containingNode supports nodes that read from an unaligned memory address + // + // This will generally be an explicit Load instruction and is generally false for machines + // without VEX support. This is because older hardware required that the SIMD operand always + // be aligned to the 'natural alignment' of the type. + bool supportsUnalignedSIMDLoads = false; + + switch (category) + { + case HW_Category_MemoryLoad: + supportsGeneralLoads = (!node->OperIsHWIntrinsic()); + break; + + case HW_Category_SimpleSIMD: + { + switch (containingIntrinsicId) + { + case NI_SSE41_ConvertToVector128Int16: + case NI_SSE41_ConvertToVector128Int32: + case NI_SSE41_ConvertToVector128Int64: + case NI_AVX2_ConvertToVector256Int16: + case NI_AVX2_ConvertToVector256Int32: + case NI_AVX2_ConvertToVector256Int64: + { + supportsGeneralLoads = (!node->OperIsHWIntrinsic()); + break; + } + + default: + { + // These intrinsics only expect 16 or 32-byte nodes for containment + assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); + + if (!comp->canUseVexEncoding()) + { + // Most instructions under the non-VEX encoding require aligned operands. 
+ // Those used for Sse2.ConvertToVector128Double (CVTDQ2PD and CVTPS2PD) + // are exceptions and don't fail for unaligned inputs. + + supportsAlignedSIMDLoads = (containingIntrinsicId != NI_SSE2_ConvertToVector128Double); + supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads; + } + else + { + supportsAlignedSIMDLoads = !comp->opts.MinOpts(); + supportsUnalignedSIMDLoads = true; + } + + supportsGeneralLoads = supportsUnalignedSIMDLoads; + break; + } + } + + assert(supportsSIMDScalarLoads == false); + break; + } + + case HW_Category_IMM: + { + switch (containingIntrinsicId) + { + case NI_SSE_Shuffle: + case NI_SSE2_ShiftLeftLogical: + case NI_SSE2_ShiftRightArithmetic: + case NI_SSE2_ShiftRightLogical: + case NI_SSE2_Shuffle: + case NI_SSE2_ShuffleHigh: + case NI_SSE2_ShuffleLow: + case NI_SSSE3_AlignRight: + case NI_SSE41_Blend: + case NI_SSE41_DotProduct: + case NI_SSE41_MultipleSumAbsoluteDifferences: + case NI_AES_KeygenAssist: + case NI_PCLMULQDQ_CarrylessMultiply: + case NI_AVX_Blend: + case NI_AVX_Compare: + case NI_AVX_DotProduct: + case NI_AVX_InsertVector128: + case NI_AVX_Permute: + case NI_AVX_Permute2x128: + case NI_AVX2_Blend: + case NI_AVX2_InsertVector128: + case NI_AVX2_MultipleSumAbsoluteDifferences: + case NI_AVX2_Permute2x128: + case NI_AVX2_Permute4x64: + case NI_AVX2_ShiftLeftLogical: + case NI_AVX2_ShiftRightArithmetic: + case NI_AVX2_ShiftRightLogical: + case NI_AVX2_ShuffleHigh: + case NI_AVX2_ShuffleLow: + { + // These intrinsics only expect 16 or 32-byte nodes for containment + assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); + assert(supportsSIMDScalarLoads == false); + + supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts(); + supportsUnalignedSIMDLoads = comp->canUseVexEncoding(); + supportsGeneralLoads = supportsUnalignedSIMDLoads; + + break; + } + + case NI_SSE2_Insert: + case NI_SSE41_Insert: + case NI_SSE41_X64_Insert: + { + if (containingNode->gtSIMDBaseType == TYP_FLOAT) + { + assert(containingIntrinsicId == NI_SSE41_Insert); + assert(genTypeSize(node->TypeGet()) == 16); + + // Sse41.Insert(V128, V128, byte) is a bit special + // in that it has different behavior depending on whether the + // second operand is coming from a register or memory. When coming + // from a register, all 4 elements of the vector can be used and it + // is effectively a regular `SimpleSIMD` operation; but when loading + // from memory, it only works with the lowest element and is effectively + // a `SIMDScalar`. + + assert(supportsAlignedSIMDLoads == false); + assert(supportsUnalignedSIMDLoads == false); + assert(supportsGeneralLoads == false); + assert(supportsSIMDScalarLoads == false); + + GenTree* op1 = containingNode->gtGetOp1(); + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + + assert(op1->OperIsList()); + assert(containingNode->gtGetOp2() == nullptr); + + GenTreeArgList* argList = op1->AsArgList(); + + op1 = argList->Current(); + argList = argList->Rest(); + + op2 = argList->Current(); + argList = argList->Rest(); + + assert(node == op2); + + op3 = argList->Current(); + + // The upper two bits of the immediate value are ignored if + // op2 comes from memory. In order to support using the upper + // bits, we need to disable containment support if op3 is not + // constant or if the constant is greater than 0x3F (which means + // at least one of the upper two bits is set). 
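+                            // For example (illustrative): Sse41.Insert(v1, v2, 0x1F) only reads the
+                            // low element of op2, so a memory operand is fine, whereas 0x40 and above
+                            // select a higher source element and require op2 to stay in a register.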
+ + if (op3->IsCnsIntOrI()) + { + ssize_t ival = op3->AsIntCon()->IconValue(); + assert((ival >= 0) && (ival <= 255)); + + supportsSIMDScalarLoads = (ival <= 0x3F); + supportsGeneralLoads = supportsSIMDScalarLoads; + } + break; + } + + // We should only get here for integral nodes. + assert(varTypeIsIntegral(node->TypeGet())); + + assert(supportsAlignedSIMDLoads == false); + assert(supportsUnalignedSIMDLoads == false); + assert(supportsSIMDScalarLoads == false); + + const unsigned expectedSize = genTypeSize(containingNode->gtSIMDBaseType); + const unsigned operandSize = genTypeSize(node->TypeGet()); + + supportsGeneralLoads = (operandSize >= expectedSize); + break; + } + + case NI_AVX_CompareScalar: + { + // These intrinsics only expect 16 or 32-byte nodes for containment + assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); + + assert(supportsAlignedSIMDLoads == false); + assert(supportsUnalignedSIMDLoads == false); + + supportsSIMDScalarLoads = true; + supportsGeneralLoads = supportsSIMDScalarLoads; + break; + } + + default: + { + assert(supportsAlignedSIMDLoads == false); + assert(supportsGeneralLoads == false); + assert(supportsSIMDScalarLoads == false); + assert(supportsUnalignedSIMDLoads == false); + break; + } + } + break; + } + + case HW_Category_SIMDScalar: + { + assert(supportsAlignedSIMDLoads == false); + assert(supportsUnalignedSIMDLoads == false); + + switch (containingIntrinsicId) + { + case NI_Vector128_CreateScalarUnsafe: + case NI_Vector256_CreateScalarUnsafe: + { + assert(supportsSIMDScalarLoads == false); + + const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType)); + const unsigned operandSize = genTypeSize(node->TypeGet()); + + supportsGeneralLoads = (operandSize == expectedSize); + break; + } + + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + { + // The memory form of this already takes a pointer, and cannot be further contained. + // The containable form is the one that takes a SIMD value, that may be in memory. + supportsGeneralLoads = (node->TypeGet() == TYP_SIMD16); + break; + } + + case NI_SSE_ConvertScalarToVector128Single: + case NI_SSE2_ConvertScalarToVector128Double: + case NI_SSE2_ConvertScalarToVector128Int32: + case NI_SSE2_ConvertScalarToVector128UInt32: + case NI_SSE_X64_ConvertScalarToVector128Single: + case NI_SSE2_X64_ConvertScalarToVector128Double: + case NI_SSE2_X64_ConvertScalarToVector128Int64: + case NI_SSE2_X64_ConvertScalarToVector128UInt64: + { + if (!varTypeIsIntegral(node->TypeGet())) + { + // The floating-point overload doesn't require any special semantics + assert(containingIntrinsicId == NI_SSE2_ConvertScalarToVector128Double); + supportsSIMDScalarLoads = true; + supportsGeneralLoads = supportsSIMDScalarLoads; + break; + } + + assert(supportsSIMDScalarLoads == false); + + const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType)); + const unsigned operandSize = genTypeSize(node->TypeGet()); + + supportsGeneralLoads = (operandSize == expectedSize); + break; + } + + default: + { + // These intrinsics only expect 16 or 32-byte nodes for containment + assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); + + supportsSIMDScalarLoads = true; + supportsGeneralLoads = supportsSIMDScalarLoads; + break; + } + } + break; + } + + case HW_Category_Scalar: + { + // We should only get here for integral nodes. 
+ assert(varTypeIsIntegral(node->TypeGet())); + + assert(supportsAlignedSIMDLoads == false); + assert(supportsUnalignedSIMDLoads == false); + assert(supportsSIMDScalarLoads == false); + + unsigned expectedSize = genTypeSize(containingNode->TypeGet()); + const unsigned operandSize = genTypeSize(node->TypeGet()); + + // CRC32 codegen depends on its second oprand's type. + // Currently, we are using SIMDBaseType to store the op2Type info. + if (containingIntrinsicId == NI_SSE42_Crc32) + { + var_types op2Type = containingNode->gtSIMDBaseType; + expectedSize = genTypeSize(op2Type); + } + + supportsGeneralLoads = (operandSize >= expectedSize); + break; + } + + default: + { + assert(supportsAlignedSIMDLoads == false); + assert(supportsGeneralLoads == false); + assert(supportsSIMDScalarLoads == false); + assert(supportsUnalignedSIMDLoads == false); + break; + } + } + + noway_assert(supportsRegOptional != nullptr); + *supportsRegOptional = supportsGeneralLoads; + + if (!node->OperIsHWIntrinsic()) + { + return supportsGeneralLoads && IsContainableMemoryOp(node); + } + + // TODO-XArch: Update this to be table driven, if possible. + + NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->gtHWIntrinsicId; + + switch (intrinsicId) + { + case NI_SSE_LoadAlignedVector128: + case NI_SSE2_LoadAlignedVector128: + case NI_AVX_LoadAlignedVector256: + { + return supportsAlignedSIMDLoads; + } + + case NI_SSE_LoadScalarVector128: + case NI_SSE2_LoadScalarVector128: + { + return supportsSIMDScalarLoads; + } + + case NI_SSE_LoadVector128: + case NI_SSE2_LoadVector128: + case NI_AVX_LoadVector256: + { + return supportsUnalignedSIMDLoads; + } + + case NI_AVX_ExtractVector128: + case NI_AVX2_ExtractVector128: + { + return false; + } + + default: + { + assert(!node->isContainableHWIntrinsic()); + return false; + } + } +} + +//---------------------------------------------------------------------------------------------- +// ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware +// intrinsic node. +// +// Arguments: +// node - The hardware intrinsic node +// addr - The address node to try contain +// +void Lowering::ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr) +{ + assert((addr->TypeGet() == TYP_I_IMPL) || (addr->TypeGet() == TYP_BYREF)); + TryCreateAddrMode(addr, true); + if ((addr->OperIs(GT_CLS_VAR_ADDR, GT_LCL_VAR_ADDR, GT_LCL_FLD_ADDR, GT_LEA) || + (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))) && + IsSafeToContainMem(node, addr)) + { + MakeSrcContained(node, addr); + } +} + +//---------------------------------------------------------------------------------------------- +// ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node. +// +// Arguments: +// node - The hardware intrinsic node. 
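+//
+// Notes:
+//    Containment covers the constant immediate operand (for the IMM category), the address
+//    operand of memory loads/stores, and memory or reg-optional source operands; for commutative
+//    RMW intrinsics the operands may be swapped so that the contained operand ends up in op2.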
+// +void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); + int numArgs = HWIntrinsicInfo::lookupNumArgs(node); + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + GenTree* op3 = nullptr; + + if (!HWIntrinsicInfo::SupportsContainment(intrinsicId)) + { + // AVX2 gather are not containable and always have constant IMM argument + if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId)) + { + GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); + assert(lastOp != nullptr); + MakeSrcContained(node, lastOp); + } + // Exit early if containment isn't supported + return; + } + + if (HWIntrinsicInfo::lookupCategory(intrinsicId) == HW_Category_IMM) + { + GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); + assert(lastOp != nullptr); + + if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI()) + { + MakeSrcContained(node, lastOp); + } + } + + if ((node->gtSIMDSize == 8) || (node->gtSIMDSize == 12)) + { + // TODO-XArch-CQ: Ideally we would key this off of the size containingNode + // expects vs the size node actually is or would be if spilled to the stack + return; + } + + // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained + + const bool isCommutative = HWIntrinsicInfo::IsCommutative(intrinsicId); + + if (numArgs == 1) + { + // One argument intrinsics cannot be commutative + assert(!isCommutative); + + assert(!op1->OperIsList()); + assert(op2 == nullptr); + + switch (category) + { + case HW_Category_MemoryLoad: + ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); + break; + + case HW_Category_SimpleSIMD: + case HW_Category_SIMDScalar: + case HW_Category_Scalar: + { + switch (intrinsicId) + { + case NI_SSE_ReciprocalScalar: + case NI_SSE_ReciprocalSqrtScalar: + case NI_SSE_SqrtScalar: + case NI_SSE2_SqrtScalar: + case NI_SSE41_CeilingScalar: + case NI_SSE41_FloorScalar: + case NI_SSE41_RoundCurrentDirectionScalar: + case NI_SSE41_RoundToNearestIntegerScalar: + case NI_SSE41_RoundToNegativeInfinityScalar: + case NI_SSE41_RoundToPositiveInfinityScalar: + case NI_SSE41_RoundToZeroScalar: + { + // These intrinsics have both 1 and 2-operand overloads. + // + // The 1-operand overload basically does `intrinsic(op1, op1)` + // + // Because of this, the operand must be loaded into a register + // and cannot be contained. + return; + } + + case NI_SSE2_ConvertToInt32: + case NI_SSE2_X64_ConvertToInt64: + case NI_SSE2_ConvertToUInt32: + case NI_SSE2_X64_ConvertToUInt64: + case NI_AVX2_ConvertToInt32: + case NI_AVX2_ConvertToUInt32: + { + if (varTypeIsIntegral(baseType)) + { + // TODO-XARCH-CQ: These intrinsics are "ins reg/mem, xmm" and don't + // currently support containment. 
+ return; + } + + break; + } + + case NI_SSE41_ConvertToVector128Int16: + case NI_SSE41_ConvertToVector128Int32: + case NI_SSE41_ConvertToVector128Int64: + case NI_AVX2_ConvertToVector256Int16: + case NI_AVX2_ConvertToVector256Int32: + case NI_AVX2_ConvertToVector256Int64: + if (!varTypeIsSIMD(op1->gtType)) + { + ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); + return; + } + break; + + default: + { + break; + } + } + + bool supportsRegOptional = false; + + if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + MakeSrcContained(node, op1); + } + else if (supportsRegOptional) + { + op1->SetRegOptional(); + } + break; + } + + default: + { + unreached(); + break; + } + } + } + else + { + if (numArgs == 2) + { + assert(!op1->OperIsList()); + assert(op2 != nullptr); + assert(!op2->OperIsList()); + + switch (category) + { + case HW_Category_MemoryLoad: + if ((intrinsicId == NI_AVX_MaskLoad) || (intrinsicId == NI_AVX2_MaskLoad)) + { + ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); + } + else + { + ContainCheckHWIntrinsicAddr(node, node->gtGetOp2()); + } + break; + + case HW_Category_MemoryStore: + ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); + + if (((intrinsicId == NI_SSE_Store) || (intrinsicId == NI_SSE2_Store)) && op2->OperIsHWIntrinsic() && + ((op2->AsHWIntrinsic()->gtHWIntrinsicId == NI_AVX_ExtractVector128) || + (op2->AsHWIntrinsic()->gtHWIntrinsicId == NI_AVX2_ExtractVector128)) && + op2->gtGetOp2()->IsIntegralConst()) + { + MakeSrcContained(node, op2); + } + break; + + case HW_Category_SimpleSIMD: + case HW_Category_SIMDScalar: + case HW_Category_Scalar: + { + bool supportsRegOptional = false; + + if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + MakeSrcContained(node, op2); + } + else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || + (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && + IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + MakeSrcContained(node, op1); + + // Swap the operands here to make the containment checks in codegen significantly simpler + node->gtOp1 = op2; + node->gtOp2 = op1; + } + else if (supportsRegOptional) + { + op2->SetRegOptional(); + + // TODO-XArch-CQ: For commutative nodes, either operand can be reg-optional. + // https://github.com/dotnet/runtime/issues/6358 + } + break; + } + + case HW_Category_IMM: + { + // We don't currently have any IMM intrinsics which are also commutative + assert(!isCommutative); + bool supportsRegOptional = false; + + switch (intrinsicId) + { + case NI_SSE2_Extract: + case NI_SSE41_Extract: + case NI_SSE41_X64_Extract: + case NI_AVX_ExtractVector128: + case NI_AVX2_ExtractVector128: + { + // TODO-XARCH-CQ: These intrinsics are "ins reg/mem, xmm, imm8" and don't + // currently support containment. 
+ break; + } + + case NI_SSE2_ShiftLeftLogical: + case NI_SSE2_ShiftRightArithmetic: + case NI_SSE2_ShiftRightLogical: + case NI_AVX2_ShiftLeftLogical: + case NI_AVX2_ShiftRightArithmetic: + case NI_AVX2_ShiftRightLogical: + { + // These intrinsics can have op2 be imm or reg/mem + + if (!HWIntrinsicInfo::isImmOp(intrinsicId, op2)) + { + if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + MakeSrcContained(node, op2); + } + else if (supportsRegOptional) + { + op2->SetRegOptional(); + } + } + break; + } + + case NI_SSE2_Shuffle: + case NI_SSE2_ShuffleHigh: + case NI_SSE2_ShuffleLow: + case NI_AVX2_Permute4x64: + case NI_AVX2_Shuffle: + case NI_AVX2_ShuffleHigh: + case NI_AVX2_ShuffleLow: + { + // These intrinsics have op2 as an imm and op1 as a reg/mem + + if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + MakeSrcContained(node, op1); + } + else if (supportsRegOptional) + { + op1->SetRegOptional(); + } + break; + } + + case NI_AVX_Permute: + { + // These intrinsics can have op2 be imm or reg/mem + // They also can have op1 be reg/mem and op2 be imm + + if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) + { + if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + MakeSrcContained(node, op1); + } + else if (supportsRegOptional) + { + op1->SetRegOptional(); + } + } + else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + MakeSrcContained(node, op2); + } + else if (supportsRegOptional) + { + op2->SetRegOptional(); + } + break; + } + + case NI_AES_KeygenAssist: + { + if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + MakeSrcContained(node, op1); + } + else if (supportsRegOptional) + { + op1->SetRegOptional(); + } + break; + } + + case NI_SSE2_ShiftLeftLogical128BitLane: + case NI_SSE2_ShiftRightLogical128BitLane: + case NI_AVX2_ShiftLeftLogical128BitLane: + case NI_AVX2_ShiftRightLogical128BitLane: + { +#if DEBUG + // These intrinsics should have been marked contained by the general-purpose handling + // earlier in the method. 
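+                            // Here we only re-check that a constant immediate last operand was
+                            // indeed contained by that earlier handling.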
+ + GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); + assert(lastOp != nullptr); + + if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI()) + { + assert(lastOp->isContained()); + } +#endif + + break; + } + + default: + { + assert(!"Unhandled containment for binary hardware intrinsic with immediate operand"); + break; + } + } + + break; + } + + default: + { + unreached(); + break; + } + } + } + else if (numArgs == 3) + { + // three argument intrinsics should not be marked commutative + assert(!isCommutative); + + assert(op1->OperIsList()); + assert(op2 == nullptr); + + GenTreeArgList* argList = op1->AsArgList(); + GenTreeArgList* originalArgList = argList; + + op1 = argList->Current(); + argList = argList->Rest(); + + op2 = argList->Current(); + argList = argList->Rest(); + + op3 = argList->Current(); + assert(argList->Rest() == nullptr); + + switch (category) + { + case HW_Category_MemoryStore: + ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()->AsOp()->gtGetOp1()); + break; + + case HW_Category_SimpleSIMD: + case HW_Category_SIMDScalar: + case HW_Category_Scalar: + { + if ((intrinsicId >= NI_FMA_MultiplyAdd) && (intrinsicId <= NI_FMA_MultiplySubtractNegatedScalar)) + { + bool supportsRegOptional = false; + + if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional)) + { + // 213 form: op1 = (op2 * op1) + [op3] + MakeSrcContained(node, op3); + } + else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + // 132 form: op1 = (op1 * op3) + [op2] + MakeSrcContained(node, op2); + } + else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + // Intrinsics with CopyUpperBits semantics cannot have op1 be contained + + if (!HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) + { + // 231 form: op3 = (op2 * op3) + [op1] + MakeSrcContained(node, op1); + } + } + else + { + assert(supportsRegOptional); + + // TODO-XArch-CQ: Technically any one of the three operands can + // be reg-optional. With a limitation on op1 where + // it can only be so if CopyUpperBits is off. 
+ // https://github.com/dotnet/runtime/issues/6358 + + // 213 form: op1 = (op2 * op1) + op3 + op3->SetRegOptional(); + } + } + else + { + bool supportsRegOptional = false; + + switch (intrinsicId) + { + case NI_SSE41_BlendVariable: + case NI_AVX_BlendVariable: + case NI_AVX2_BlendVariable: + { + if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + MakeSrcContained(node, op2); + } + else if (supportsRegOptional) + { + op2->SetRegOptional(); + } + break; + } + + case NI_BMI2_MultiplyNoFlags: + case NI_BMI2_X64_MultiplyNoFlags: + { + if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + MakeSrcContained(node, op2); + } + else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) + { + MakeSrcContained(node, op1); + // MultiplyNoFlags is a Commutative operation, so swap the first two operands here + // to make the containment checks in codegen significantly simpler + *(originalArgList->pCurrent()) = op2; + *(originalArgList->Rest()->pCurrent()) = op1; + } + else if (supportsRegOptional) + { + op2->SetRegOptional(); + } + break; + } + + default: + { + unreached(); + break; + } + } + } + break; + } + + case HW_Category_IMM: + { + bool supportsRegOptional = false; + + switch (intrinsicId) + { + case NI_SSE_Shuffle: + case NI_SSE2_Insert: + case NI_SSE2_Shuffle: + case NI_SSSE3_AlignRight: + case NI_SSE41_Blend: + case NI_SSE41_DotProduct: + case NI_SSE41_Insert: + case NI_SSE41_X64_Insert: + case NI_SSE41_MultipleSumAbsoluteDifferences: + case NI_AVX_Blend: + case NI_AVX_Compare: + case NI_AVX_CompareScalar: + case NI_AVX_DotProduct: + case NI_AVX_InsertVector128: + case NI_AVX_Permute2x128: + case NI_AVX_Shuffle: + case NI_AVX2_AlignRight: + case NI_AVX2_Blend: + case NI_AVX2_InsertVector128: + case NI_AVX2_MultipleSumAbsoluteDifferences: + case NI_AVX2_Permute2x128: + case NI_PCLMULQDQ_CarrylessMultiply: + { + if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) + { + MakeSrcContained(node, op2); + } + else if (supportsRegOptional) + { + op2->SetRegOptional(); + } + break; + } + + default: + { + assert(!"Unhandled containment for ternary hardware intrinsic with immediate operand"); + break; + } + } + + break; + } + + default: + { + unreached(); + break; + } + } + } + else + { + unreached(); + } + } +} +#endif // FEATURE_HW_INTRINSICS + +//------------------------------------------------------------------------ +// ContainCheckFloatBinary: determine whether the sources of a floating point binary node should be contained. +// +// Arguments: +// node - pointer to the node +// +void Lowering::ContainCheckFloatBinary(GenTreeOp* node) +{ + assert(node->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_DIV) && varTypeIsFloating(node)); + + // overflow operations aren't supported on float/double types. + assert(!node->gtOverflowEx()); + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + // No implicit conversions at this stage as the expectation is that + // everything is made explicit by adding casts. 
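+    // Containment preference below: op2 first (constant or memory op), then op1 when the
+    // operation is commutative, and finally one operand may be marked reg-optional.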
+ assert(op1->TypeGet() == op2->TypeGet()); + + bool isSafeToContainOp1 = true; + bool isSafeToContainOp2 = true; + + if (op2->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(node, op2); + } + else if (IsContainableMemoryOp(op2)) + { + isSafeToContainOp2 = IsSafeToContainMem(node, op2); + if (isSafeToContainOp2) + { + MakeSrcContained(node, op2); + } + } + + if (!op2->isContained() && node->OperIsCommutative()) + { + // Though we have GT_ADD(op1=memOp, op2=non-memOp, we try to reorder the operands + // as long as it is safe so that the following efficient code sequence is generated: + // addss/sd targetReg, memOp (if op1Reg == targetReg) OR + // movaps targetReg, op2Reg; addss/sd targetReg, [memOp] + // + // Instead of + // movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR + // movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg + + if (op1->IsCnsNonZeroFltOrDbl()) + { + MakeSrcContained(node, op1); + } + else if (IsContainableMemoryOp(op1)) + { + isSafeToContainOp1 = IsSafeToContainMem(node, op1); + if (isSafeToContainOp1) + { + MakeSrcContained(node, op1); + } + } + } + + if (!op1->isContained() && !op2->isContained()) + { + // If there are no containable operands, we can make an operand reg optional. + // IsSafeToContainMem is expensive so we call it at most once for each operand + // in this method. If we already called IsSafeToContainMem, it must have returned false; + // otherwise, the corresponding operand (op1 or op2) would be contained. + isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1); + isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2); + SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2); + } +} + +#endif // defined (TARGET_WASM32) || defined(TARGET_WASM64) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 345cbb451f78..de77e0fa4cfe 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -735,7 +735,7 @@ class LinearScan : public LinearScanInterface // Hence the "SmallFPSet" has 5 elements. CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_AMD64) +#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) #ifdef UNIX_AMD64_ABI // On System V the RDI and RSI are not callee saved. Use R12 ans R13 as callee saved registers. static const regMaskTP LsraLimitSmallIntSet = @@ -757,6 +757,17 @@ class LinearScan : public LinearScanInterface #elif defined(TARGET_X86) static const regMaskTP LsraLimitSmallIntSet = (RBM_EAX | RBM_ECX | RBM_EDI); static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#ifdef UNIX_AMD64_ABI + // On System V the RDI and RSI are not callee saved. Use R12 ans R13 as callee saved registers. + static const regMaskTP LsraLimitSmallIntSet = + (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_R12 | RBM_R13); +#else // !UNIX_AMD64_ABI + // On Windows Amd64 use the RDI and RSI as callee saved registers. 
+ static const regMaskTP LsraLimitSmallIntSet = + (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI); +#endif // !UNIX_AMD64_ABI + static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); #else #error Unsupported or unset target architecture #endif // target @@ -1722,7 +1733,7 @@ class LinearScan : public LinearScanInterface void setDelayFree(RefPosition* use); int BuildBinaryUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE); -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) int BuildRMWUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE); #endif // !TARGET_XARCH // This is the main entry point for building the RefPositions for a node. @@ -1743,7 +1754,7 @@ class LinearScan : public LinearScanInterface void BuildDefsWithKills(GenTree* tree, int dstCount, regMaskTP dstCandidates, regMaskTP killMask); int BuildReturn(GenTree* tree); -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // This method, unlike the others, returns the number of sources, since it may be called when // 'tree' is contained. int BuildShiftRotate(GenTree* tree); @@ -1764,7 +1775,7 @@ class LinearScan : public LinearScanInterface int BuildGCWriteBarrier(GenTree* tree); int BuildCast(GenTreeCast* cast); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // returns true if the tree can use the read-modify-write memory instruction form bool isRMWRegOper(GenTree* tree); int BuildMul(GenTree* tree); diff --git a/src/coreclr/jit/lsrawasm.cpp b/src/coreclr/jit/lsrawasm.cpp new file mode 100644 index 000000000000..67f7f658aa3f --- /dev/null +++ b/src/coreclr/jit/lsrawasm.cpp @@ -0,0 +1,1528 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX Register Requirements for AMD64 XX +XX XX +XX This encapsulates all the logic for setting register requirements for XX +XX the AMD64 architecture. XX +XX XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) + +#include "jit.h" +#include "sideeffects.h" +#include "lower.h" + +//------------------------------------------------------------------------ +// BuildNode: Build the RefPositions for for a node +// +// Arguments: +// treeNode - the node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +// Notes: +// Preconditions: +// LSRA Has been initialized. +// +// Postconditions: +// RefPositions have been built for all the register defs and uses required +// for this node. +// +int LinearScan::BuildNode(GenTree* tree) +{ + assert(false); + return 0; +} + +//------------------------------------------------------------------------ +// getTgtPrefOperands: Identify whether the operands of an Op should be preferenced to the target. +// +// Arguments: +// tree - the node of interest. +// prefOp1 - a bool "out" parameter indicating, on return, whether op1 should be preferenced to the target. 
+// prefOp2 - a bool "out" parameter indicating, on return, whether op2 should be preferenced to the target. +// +// Return Value: +// This has two "out" parameters for returning the results (see above). +// +// Notes: +// The caller is responsible for initializing the two "out" parameters to false. +// +void LinearScan::getTgtPrefOperands(GenTreeOp* tree, bool& prefOp1, bool& prefOp2) +{ + assert(false); +} + +//------------------------------------------------------------------------------ +// isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format +// +// Arguments: +// tree - a binary tree node +// +// Return Value: +// Returns true if we can use the read-modify-write instruction form +// +// Notes: +// This is used to determine whether to preference the source to the destination register. +// +bool LinearScan::isRMWRegOper(GenTree* tree) +{ + assert(false); + return false; +} + +// Support for building RefPositions for RMW nodes. +int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates) +{ + assert(false); + return 0; +} + +//------------------------------------------------------------------------ +// BuildShiftRotate: Set the NodeInfo for a shift or rotate. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildShiftRotate(GenTree* tree) +{ + assert(false); + return 0; +} + +//------------------------------------------------------------------------ +// BuildCall: Set the NodeInfo for a call. +// +// Arguments: +// call - The call node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildCall(GenTreeCall* call) +{ + bool hasMultiRegRetVal = false; + const ReturnTypeDesc* retTypeDesc = nullptr; + int srcCount = 0; + int dstCount = 0; + regMaskTP dstCandidates = RBM_NONE; + + assert(!call->isContained()); + if (call->TypeGet() != TYP_VOID) + { + hasMultiRegRetVal = call->HasMultiRegRetVal(); + if (hasMultiRegRetVal) + { + // dst count = number of registers in which the value is returned by call + retTypeDesc = call->GetReturnTypeDesc(); + dstCount = retTypeDesc->GetReturnRegCount(); + } + else + { + dstCount = 1; + } + } + + GenTree* ctrlExpr = call->gtControlExpr; + if (call->gtCallType == CT_INDIRECT) + { + ctrlExpr = call->gtCallAddr; + } + + RegisterType registerType = regType(call); + + // Set destination candidates for return value of the call. + CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef TARGET_X86 + if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME)) + { + // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with + // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the + // correct argument registers. + dstCandidates = RBM_PINVOKE_TCB; + } + else +#endif // TARGET_X86 + if (hasMultiRegRetVal) + { + assert(retTypeDesc != nullptr); + dstCandidates = retTypeDesc->GetABIReturnRegs(); + assert((int)genCountBits(dstCandidates) == dstCount); + } + else if (varTypeUsesFloatReg(registerType)) + { +#ifdef TARGET_X86 + // The return value will be on the X87 stack, and we will need to move it. 
+ dstCandidates = allRegs(registerType); +#else // !TARGET_X86 + dstCandidates = RBM_FLOATRET; +#endif // !TARGET_X86 + } + else if (registerType == TYP_LONG) + { + dstCandidates = RBM_LNGRET; + } + else + { + dstCandidates = RBM_INTRET; + } + + // number of args to a call = + // callRegArgs + (callargs - placeholders, setup, etc) + // there is an explicit thisPtr but it is redundant + + bool callHasFloatRegArgs = false; + bool isVarArgs = call->IsVarargs(); + + // First, determine internal registers. + // We will need one for any float arguments to a varArgs call. + for (GenTreeCall::Use& use : call->LateArgs()) + { + GenTree* argNode = use.GetNode(); + if (argNode->OperIsPutArgReg()) + { + HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs); + } + else if (argNode->OperGet() == GT_FIELD_LIST) + { + for (GenTreeFieldList::Use& use : argNode->AsFieldList()->Uses()) + { + assert(use.GetNode()->OperIsPutArgReg()); + HandleFloatVarArgs(call, use.GetNode(), &callHasFloatRegArgs); + } + } + } + + // Now, count reg args + for (GenTreeCall::Use& use : call->LateArgs()) + { + // By this point, lowering has ensured that all call arguments are one of the following: + // - an arg setup store + // - an arg placeholder + // - a nop + // - a copy blk + // - a field list + // - a put arg + // + // Note that this property is statically checked by LinearScan::CheckBlock. + GenTree* argNode = use.GetNode(); + + // Each register argument corresponds to one source. + if (argNode->OperIsPutArgReg()) + { + srcCount++; + BuildUse(argNode, genRegMask(argNode->GetRegNum())); + } +#ifdef UNIX_AMD64_ABI + else if (argNode->OperGet() == GT_FIELD_LIST) + { + for (GenTreeFieldList::Use& use : argNode->AsFieldList()->Uses()) + { + assert(use.GetNode()->OperIsPutArgReg()); + srcCount++; + BuildUse(use.GetNode(), genRegMask(use.GetNode()->GetRegNum())); + } + } +#endif // UNIX_AMD64_ABI + +#ifdef DEBUG + // In DEBUG only, check validity with respect to the arg table entry. + + fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode); + assert(curArgTabEntry); + + if (curArgTabEntry->GetRegNum() == REG_STK) + { + // late arg that is not passed in a register + assert(argNode->gtOper == GT_PUTARG_STK); + +#ifdef FEATURE_PUT_STRUCT_ARG_STK + // If the node is TYP_STRUCT and it is put on stack with + // putarg_stk operation, we consume and produce no registers. + // In this case the embedded Obj node should not produce + // registers too since it is contained. + // Note that if it is a SIMD type the argument will be in a register. 
+ if (argNode->TypeGet() == TYP_STRUCT) + { + assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ); + assert(argNode->gtGetOp1()->isContained()); + } +#endif // FEATURE_PUT_STRUCT_ARG_STK + continue; + } +#ifdef UNIX_AMD64_ABI + if (argNode->OperGet() == GT_FIELD_LIST) + { + assert(argNode->isContained()); + assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct); + + unsigned regIndex = 0; + for (GenTreeFieldList::Use& use : argNode->AsFieldList()->Uses()) + { + const regNumber argReg = curArgTabEntry->GetRegNum(regIndex); + assert(use.GetNode()->GetRegNum() == argReg); + regIndex++; + } + } + else +#endif // UNIX_AMD64_ABI + { + const regNumber argReg = curArgTabEntry->GetRegNum(); + assert(argNode->GetRegNum() == argReg); + } +#endif // DEBUG + } + +#ifdef DEBUG + // Now, count stack args + // Note that these need to be computed into a register, but then + // they're just stored to the stack - so the reg doesn't + // need to remain live until the call. In fact, it must not + // because the code generator doesn't actually consider it live, + // so it can't be spilled. + + for (GenTreeCall::Use& use : call->Args()) + { + GenTree* arg = use.GetNode(); + if (!(arg->gtFlags & GTF_LATE_ARG) && !arg) + { + if (arg->IsValue() && !arg->isContained()) + { + assert(arg->IsUnusedValue()); + } + } + } +#endif // DEBUG + + // set reg requirements on call target represented as control sequence. + if (ctrlExpr != nullptr) + { + regMaskTP ctrlExprCandidates = RBM_NONE; + + // In case of fast tail implemented as jmp, make sure that gtControlExpr is + // computed into a register. + if (call->IsFastTailCall()) + { + assert(!ctrlExpr->isContained()); + // Fast tail call - make sure that call target is always computed in RAX + // so that epilog sequence can generate "jmp rax" to achieve fast tail call. + ctrlExprCandidates = RBM_RAX; + } +#ifdef TARGET_X86 + else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) + { + // On x86, we need to generate a very specific pattern for indirect VSD calls: + // + // 3-byte nop + // call dword ptr [eax] + // + // Where EAX is also used as an argument to the stub dispatch helper. Make + // sure that the call target address is computed into EAX in this case. + assert(ctrlExpr->isIndir() && ctrlExpr->isContained()); + ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET; + } +#endif // TARGET_X86 + +#if FEATURE_VARARG + // If it is a fast tail call, it is already preferenced to use RAX. + // Therefore, no need set src candidates on call tgt again. + if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall()) + { + // Don't assign the call target to any of the argument registers because + // we will use them to also pass floating point arguments as required + // by Amd64 ABI. + ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS); + } +#endif // !FEATURE_VARARG + srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates); + } + + buildInternalRegisterUses(); + + // Now generate defs and kills. + regMaskTP killMask = getKillSetForCall(call); + BuildDefsWithKills(call, dstCount, dstCandidates, killMask); + return srcCount; +} + +//------------------------------------------------------------------------ +// BuildBlockStore: Build the RefPositions for a block store node. +// +// Arguments: +// blkNode - The block store node of interest +// +// Return Value: +// The number of sources consumed by this node. 
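+//
+// Notes:
+//    Not yet implemented for the wasm targets; the stub below simply asserts.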
+// +int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) +{ + assert(false); + return 0; +} + +#ifdef FEATURE_PUT_STRUCT_ARG_STK +//------------------------------------------------------------------------ +// BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) +{ + assert(false); + return 0; +} +#endif // FEATURE_PUT_STRUCT_ARG_STK + +//------------------------------------------------------------------------ +// BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildLclHeap(GenTree* tree) +{ + int srcCount = 1; + + // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp): + // Here '-' means don't care. + // + // Size? Init Memory? # temp regs + // 0 - 0 (returns 0) + // const and <=6 reg words - 0 (pushes '0') + // const and >6 reg words Yes 0 (pushes '0') + // const and =PageSize No 2 (regCnt and tmpReg for subtracing from sp) + // Non-const Yes 0 (regCnt=targetReg and pushes '0') + // Non-const No 2 (regCnt and tmpReg for subtracting from sp) + // + // Note: Here we don't need internal register to be different from targetReg. + // Rather, require it to be different from operand's reg. + + GenTree* size = tree->gtGetOp1(); + if (size->IsCnsIntOrI()) + { + assert(size->isContained()); + srcCount = 0; + size_t sizeVal = size->AsIntCon()->gtIconVal; + + if (sizeVal == 0) + { + buildInternalIntRegisterDefForNode(tree); + } + else + { + // Compute the amount of memory to properly STACK_ALIGN. + // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. + // This should also help in debugging as we can examine the original size specified with localloc. + sizeVal = AlignUp(sizeVal, STACK_ALIGN); + + // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc) + // we will generate 'push 0'. + assert((sizeVal % REGSIZE_BYTES) == 0); + size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES; + if (cntRegSizedWords > 6) + { + if (!compiler->info.compInitMem) + { + // No need to initialize allocated stack space. + if (sizeVal < compiler->eeGetPageSize()) + { +#ifdef TARGET_X86 + // x86 needs a register here to avoid generating "sub" on ESP. + buildInternalIntRegisterDefForNode(tree); +#endif + } + else + { + // We need two registers: regCnt and RegTmp + buildInternalIntRegisterDefForNode(tree); + buildInternalIntRegisterDefForNode(tree); + } + } + } + } + } + else + { + if (!compiler->info.compInitMem) + { + buildInternalIntRegisterDefForNode(tree); + buildInternalIntRegisterDefForNode(tree); + } + BuildUse(size); + } + buildInternalRegisterUses(); + BuildDef(tree); + return srcCount; +} + +//------------------------------------------------------------------------ +// BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// The number of sources consumed by this node. 
+// +int LinearScan::BuildModDiv(GenTree* tree) +{ + GenTree* op1 = tree->gtGetOp1(); + GenTree* op2 = tree->gtGetOp2(); + regMaskTP dstCandidates = RBM_NONE; + RefPosition* internalDef = nullptr; + int srcCount = 0; + + if (varTypeIsFloating(tree->TypeGet())) + { + return BuildSimple(tree); + } + + // Amd64 Div/Idiv instruction: + // Dividend in RAX:RDX and computes + // Quotient in RAX, Remainder in RDX + + if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD) + { + // We are interested in just the remainder. + // RAX is used as a trashable register during computation of remainder. + dstCandidates = RBM_RDX; + } + else + { + // We are interested in just the quotient. + // RDX gets used as trashable register during computation of quotient + dstCandidates = RBM_RAX; + } + +#ifdef TARGET_X86 + if (op1->OperGet() == GT_LONG) + { + assert(op1->isContained()); + + // To avoid reg move would like to have op1's low part in RAX and high part in RDX. + GenTree* loVal = op1->gtGetOp1(); + GenTree* hiVal = op1->gtGetOp2(); + assert(!loVal->isContained() && !hiVal->isContained()); + + assert(op2->IsCnsIntOrI()); + assert(tree->OperGet() == GT_UMOD); + + // This situation also requires an internal register. + buildInternalIntRegisterDefForNode(tree); + + BuildUse(loVal, RBM_EAX); + BuildUse(hiVal, RBM_EDX); + srcCount = 2; + } + else +#endif + { + // If possible would like to have op1 in RAX to avoid a register move. + RefPosition* op1Use = BuildUse(op1, RBM_EAX); + tgtPrefUse = op1Use; + srcCount = 1; + } + + srcCount += BuildDelayFreeUses(op2, op1, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); + + buildInternalRegisterUses(); + + regMaskTP killMask = getKillSetForModDiv(tree->AsOp()); + BuildDefsWithKills(tree, 1, dstCandidates, killMask); + return srcCount; +} + +//------------------------------------------------------------------------ +// BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildIntrinsic(GenTree* tree) +{ + // Both operand and its result must be of floating point type. + GenTree* op1 = tree->gtGetOp1(); + assert(varTypeIsFloating(op1)); + assert(op1->TypeGet() == tree->TypeGet()); + RefPosition* internalFloatDef = nullptr; + + switch (tree->AsIntrinsic()->gtIntrinsicName) + { + case NI_System_Math_Abs: + // Abs(float x) = x & 0x7fffffff + // Abs(double x) = x & 0x7ffffff ffffffff + + // In case of Abs we need an internal register to hold mask. + + // TODO-XArch-CQ: avoid using an internal register for the mask. + // Andps or andpd both will operate on 128-bit operands. + // The data section constant to hold the mask is a 64-bit size. + // Therefore, we need both the operand and mask to be in + // xmm register. When we add support in emitter to emit 128-bit + // data constants and instructions that operate on 128-bit + // memory operands we can avoid the need for an internal register. 
+ internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates()); + break; + +#ifdef TARGET_X86 + case NI_System_Math_Cos: + case NI_System_Math_Sin: + NYI_X86("Math intrinsics Cos and Sin"); + break; +#endif // TARGET_X86 + + case NI_System_Math_Sqrt: + case NI_System_Math_Round: + case NI_System_Math_Ceiling: + case NI_System_Math_Floor: + break; + + default: + // Right now only Sqrt/Abs are treated as math intrinsics + noway_assert(!"Unsupported math intrinsic"); + unreached(); + break; + } + assert(tree->gtGetOp2IfPresent() == nullptr); + int srcCount; + if (op1->isContained()) + { + srcCount = BuildOperandUses(op1); + } + else + { + tgtPrefUse = BuildUse(op1); + srcCount = 1; + } + if (internalFloatDef != nullptr) + { + buildInternalRegisterUses(); + } + BuildDef(tree); + return srcCount; +} + +#ifdef FEATURE_SIMD +//------------------------------------------------------------------------ +// BuildSIMD: Set the NodeInfo for a GT_SIMD tree. +// +// Arguments: +// tree - The GT_SIMD node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) +{ + // All intrinsics have a dstCount of 1 + assert(simdTree->IsValue()); + + bool buildUses = true; + regMaskTP dstCandidates = RBM_NONE; + + if (simdTree->isContained()) + { + // Only SIMDIntrinsicInit can be contained + assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit); + } + SetContainsAVXFlags(simdTree->gtSIMDSize); + GenTree* op1 = simdTree->gtGetOp1(); + GenTree* op2 = simdTree->gtGetOp2(); + int srcCount = 0; + + switch (simdTree->gtSIMDIntrinsicID) + { + case SIMDIntrinsicInit: + { + // This sets all fields of a SIMD struct to the given value. + // Mark op1 as contained if it is either zero or int constant of all 1's, + // or a float constant with 16 or 32 byte simdType (AVX case) + // + // Note that for small int base types, the initVal has been constructed so that + // we can use the full int value. + CLANG_FORMAT_COMMENT_ANCHOR; + +#if !defined(TARGET_64BIT) + if (op1->OperGet() == GT_LONG) + { + assert(op1->isContained()); + GenTree* op1lo = op1->gtGetOp1(); + GenTree* op1hi = op1->gtGetOp2(); + + if (op1lo->isContained()) + { + srcCount = 0; + assert(op1hi->isContained()); + assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || + (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))); + } + else + { + srcCount = 2; + buildInternalFloatRegisterDefForNode(simdTree); + setInternalRegsDelayFree = true; + } + + if (srcCount == 2) + { + BuildUse(op1lo, RBM_EAX); + BuildUse(op1hi, RBM_EDX); + } + buildUses = false; + } +#endif // !defined(TARGET_64BIT) + } + break; + + case SIMDIntrinsicInitN: + { + var_types baseType = simdTree->gtSIMDBaseType; + srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(baseType)); + // Need an internal register to stitch together all the values into a single vector in a SIMD reg. + buildInternalFloatRegisterDefForNode(simdTree); + int initCount = 0; + for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2()) + { + assert(list->OperGet() == GT_LIST); + GenTree* listItem = list->gtGetOp1(); + assert(listItem->TypeGet() == baseType); + assert(!listItem->isContained()); + BuildUse(listItem); + initCount++; + } + assert(initCount == srcCount); + buildUses = false; + } + break; + + case SIMDIntrinsicInitArray: + // We have an array and an index, which may be contained. 
+ break; + + case SIMDIntrinsicSub: + case SIMDIntrinsicBitwiseAnd: + case SIMDIntrinsicBitwiseOr: + break; + + case SIMDIntrinsicEqual: + break; + + case SIMDIntrinsicGetItem: + { + // This implements get_Item method. The sources are: + // - the source SIMD struct + // - index (which element to get) + // The result is baseType of SIMD struct. + // op1 may be a contained memory op, but if so we will consume its address. + // op2 may be a contained constant. + op1 = simdTree->gtGetOp1(); + op2 = simdTree->gtGetOp2(); + + if (!op1->isContained()) + { + // If the index is not a constant, we will use the SIMD temp location to store the vector. + // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we + // can use that in the process of extracting the element. + // + // If the index is a constant and base type is a small int we can use pextrw, but on AVX + // we will need a temp if are indexing into the upper half of the AVX register. + // In all other cases with constant index, we need a temp xmm register to extract the + // element if index is other than zero. + + if (!op2->IsCnsIntOrI()) + { + (void)compiler->getSIMDInitTempVarNum(); + } + else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) + { + bool needFloatTemp; + if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && + (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)) + { + int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); + needFloatTemp = (byteShiftCnt >= 16); + } + else + { + needFloatTemp = !op2->IsIntegralConst(0); + } + + if (needFloatTemp) + { + buildInternalFloatRegisterDefForNode(simdTree); + } + } +#ifdef TARGET_X86 + // This logic is duplicated from genSIMDIntrinsicGetItem(). + // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to + // generate a movzx/movsx. On x86, these require byteable registers. So figure out which + // cases will require this, so the non-byteable registers can be excluded. + + var_types baseType = simdTree->gtSIMDBaseType; + if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType)) + { + bool ZeroOrSignExtnReqd = true; + unsigned baseSize = genTypeSize(baseType); + if (baseSize == 1) + { + if ((op2->AsIntCon()->gtIconVal % 2) == 1) + { + ZeroOrSignExtnReqd = (baseType == TYP_BYTE); + } + } + else + { + assert(baseSize == 2); + ZeroOrSignExtnReqd = (baseType == TYP_SHORT); + } + if (ZeroOrSignExtnReqd) + { + dstCandidates = allByteRegs(); + } + } +#endif // TARGET_X86 + } + } + break; + + case SIMDIntrinsicSetX: + case SIMDIntrinsicSetY: + case SIMDIntrinsicSetZ: + case SIMDIntrinsicSetW: + // We need an internal integer register for SSE2 codegen + if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) + { + buildInternalIntRegisterDefForNode(simdTree); + } + + break; + + case SIMDIntrinsicCast: + break; + + case SIMDIntrinsicConvertToSingle: + if (simdTree->gtSIMDBaseType == TYP_UINT) + { + // We need an internal register different from targetReg. + setInternalRegsDelayFree = true; + buildInternalFloatRegisterDefForNode(simdTree); + buildInternalFloatRegisterDefForNode(simdTree); + // We also need an integer register. + buildInternalIntRegisterDefForNode(simdTree); + } + break; + + case SIMDIntrinsicConvertToInt32: + break; + + case SIMDIntrinsicWidenLo: + case SIMDIntrinsicWidenHi: + if (varTypeIsIntegral(simdTree->gtSIMDBaseType)) + { + // We need an internal register different from targetReg. 
+ setInternalRegsDelayFree = true; + buildInternalFloatRegisterDefForNode(simdTree); + } + break; + + case SIMDIntrinsicConvertToInt64: + // We need an internal register different from targetReg. + setInternalRegsDelayFree = true; + buildInternalFloatRegisterDefForNode(simdTree); + if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) + { + buildInternalFloatRegisterDefForNode(simdTree); + } + // We also need an integer register. + buildInternalIntRegisterDefForNode(simdTree); + break; + + case SIMDIntrinsicConvertToDouble: + // We need an internal register different from targetReg. + setInternalRegsDelayFree = true; + buildInternalFloatRegisterDefForNode(simdTree); +#ifdef TARGET_X86 + if (simdTree->gtSIMDBaseType == TYP_LONG) + { + buildInternalFloatRegisterDefForNode(simdTree); + buildInternalFloatRegisterDefForNode(simdTree); + } + else +#endif + if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG)) + { + buildInternalFloatRegisterDefForNode(simdTree); + } + // We also need an integer register. + buildInternalIntRegisterDefForNode(simdTree); + break; + + case SIMDIntrinsicNarrow: + // We need an internal register different from targetReg. + setInternalRegsDelayFree = true; + buildInternalFloatRegisterDefForNode(simdTree); + if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE)) + { + buildInternalFloatRegisterDefForNode(simdTree); + } + break; + + case SIMDIntrinsicShuffleSSE2: + // Second operand is an integer constant and marked as contained. + assert(simdTree->gtGetOp2()->isContainedIntOrIImmed()); + break; + + case SIMDIntrinsicGetX: + case SIMDIntrinsicGetY: + case SIMDIntrinsicGetZ: + case SIMDIntrinsicGetW: + assert(!"Get intrinsics should not be seen during Lowering."); + unreached(); + + default: + noway_assert(!"Unimplemented SIMD node type."); + unreached(); + } + if (buildUses) + { + assert(!op1->OperIs(GT_LIST)); + assert(srcCount == 0); + // This is overly conservative, but is here for zero diffs. + srcCount = BuildRMWUses(simdTree); + } + buildInternalRegisterUses(); + BuildDef(simdTree, dstCandidates); + return srcCount; +} +#endif // FEATURE_SIMD + +#ifdef FEATURE_HW_INTRINSICS +//------------------------------------------------------------------------ +// BuildHWIntrinsic: Set the NodeInfo for a GT_HWINTRINSIC tree. +// +// Arguments: +// tree - The GT_HWINTRINSIC node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) +{ + NamedIntrinsic intrinsicId = intrinsicTree->gtHWIntrinsicId; + var_types baseType = intrinsicTree->gtSIMDBaseType; + CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId); + HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); + int numArgs = HWIntrinsicInfo::lookupNumArgs(intrinsicTree); + + // Set the AVX Flags if this instruction may use VEX encoding for SIMD operations. + // Note that this may be true even if the ISA is not AVX (e.g. for platform-agnostic intrinsics + // or non-AVX intrinsics that will use VEX encoding if it is available on the target). + if (intrinsicTree->isSIMD()) + { + SetContainsAVXFlags(intrinsicTree->gtSIMDSize); + } + + GenTree* op1 = intrinsicTree->gtGetOp1(); + GenTree* op2 = intrinsicTree->gtGetOp2(); + GenTree* op3 = nullptr; + GenTree* lastOp = nullptr; + + int srcCount = 0; + int dstCount = intrinsicTree->IsValue() ? 
1 : 0; + + regMaskTP dstCandidates = RBM_NONE; + + if (op1 == nullptr) + { + assert(op2 == nullptr); + assert(numArgs == 0); + } + else + { + if (op1->OperIsList()) + { + assert(op2 == nullptr); + assert(numArgs >= 3); + + GenTreeArgList* argList = op1->AsArgList(); + + op1 = argList->Current(); + argList = argList->Rest(); + + op2 = argList->Current(); + argList = argList->Rest(); + + op3 = argList->Current(); + + while (argList->Rest() != nullptr) + { + argList = argList->Rest(); + } + + lastOp = argList->Current(); + argList = argList->Rest(); + + assert(argList == nullptr); + } + else if (op2 != nullptr) + { + assert(numArgs == 2); + lastOp = op2; + } + else + { + assert(numArgs == 1); + lastOp = op1; + } + + assert(lastOp != nullptr); + + bool buildUses = true; + + if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId)) + { + if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed()) + { + assert(!lastOp->IsCnsIntOrI()); + + // We need two extra reg when lastOp isn't a constant so + // the offset into the jump table for the fallback path + // can be computed. + buildInternalIntRegisterDefForNode(intrinsicTree); + buildInternalIntRegisterDefForNode(intrinsicTree); + } + } + + // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it + // is not allocated the same register as the target. + bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); + + // Create internal temps, and handle any other special requirements. + // Note that the default case for building uses will handle the RMW flag, but if the uses + // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree) + // must be handled within the case. + switch (intrinsicId) + { + case NI_Vector128_CreateScalarUnsafe: + case NI_Vector128_ToScalar: + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector256_ToScalar: + { + assert(numArgs == 1); + + if (varTypeIsFloating(baseType)) + { + if (op1->isContained()) + { + srcCount += BuildOperandUses(op1); + } + else + { + // We will either be in memory and need to be moved + // into a register of the appropriate size or we + // are already in an XMM/YMM register and can stay + // where we are. + + tgtPrefUse = BuildUse(op1); + srcCount += 1; + } + + buildUses = false; + } + break; + } + + case NI_Vector128_ToVector256: + case NI_Vector128_ToVector256Unsafe: + case NI_Vector256_GetLower: + { + assert(numArgs == 1); + + if (op1->isContained()) + { + srcCount += BuildOperandUses(op1); + } + else + { + // We will either be in memory and need to be moved + // into a register of the appropriate size or we + // are already in an XMM/YMM register and can stay + // where we are. + + tgtPrefUse = BuildUse(op1); + srcCount += 1; + } + + buildUses = false; + break; + } + + case NI_SSE2_MaskMove: + { + assert(numArgs == 3); + assert(!isRMW); + + // MaskMove hardcodes the destination (op3) in DI/EDI/RDI + srcCount += BuildOperandUses(op1); + srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op3, RBM_EDI); + + buildUses = false; + break; + } + + case NI_SSE41_BlendVariable: + { + assert(numArgs == 3); + + if (!compiler->canUseVexEncoding()) + { + assert(isRMW); + + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + tgtPrefUse = BuildUse(op1); + + srcCount += 1; + srcCount += op2->isContained() ? 
BuildOperandUses(op2) : BuildDelayFreeUses(op2, op1); + srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); + + buildUses = false; + } + break; + } + + case NI_SSE41_Extract: + { + if (baseType == TYP_FLOAT) + { + buildInternalIntRegisterDefForNode(intrinsicTree); + } +#ifdef TARGET_X86 + else if (varTypeIsByte(baseType)) + { + dstCandidates = allByteRegs(); + } +#endif + break; + } + +#ifdef TARGET_X86 + case NI_SSE42_Crc32: + case NI_SSE42_X64_Crc32: + { + // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument + // to the code generator. We may want to encode the overload info in another way. + + assert(numArgs == 2); + assert(isRMW); + + // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers. + tgtPrefUse = BuildUse(op1); + + srcCount += 1; + srcCount += BuildDelayFreeUses(op2, op1, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE); + + buildUses = false; + break; + } +#endif // TARGET_X86 + + case NI_BMI2_MultiplyNoFlags: + case NI_BMI2_X64_MultiplyNoFlags: + { + assert(numArgs == 2 || numArgs == 3); + srcCount += BuildOperandUses(op1, RBM_EDX); + srcCount += BuildOperandUses(op2); + if (numArgs == 3) + { + // op3 reg should be different from target reg to + // store the lower half result after executing the instruction + srcCount += BuildDelayFreeUses(op3, op1); + // Need a internal register different from the dst to take the lower half result + buildInternalIntRegisterDefForNode(intrinsicTree); + setInternalRegsDelayFree = true; + } + buildUses = false; + break; + } + + case NI_FMA_MultiplyAdd: + case NI_FMA_MultiplyAddNegated: + case NI_FMA_MultiplyAddNegatedScalar: + case NI_FMA_MultiplyAddScalar: + case NI_FMA_MultiplyAddSubtract: + case NI_FMA_MultiplySubtract: + case NI_FMA_MultiplySubtractAdd: + case NI_FMA_MultiplySubtractNegated: + case NI_FMA_MultiplySubtractNegatedScalar: + case NI_FMA_MultiplySubtractScalar: + { + assert(numArgs == 3); + assert(isRMW); + + const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId); + + // Intrinsics with CopyUpperBits semantics cannot have op1 be contained + assert(!copiesUpperBits || !op1->isContained()); + + if (op2->isContained()) + { + // 132 form: op1 = (op1 * op3) + [op2] + + tgtPrefUse = BuildUse(op1); + + srcCount += 1; + srcCount += BuildOperandUses(op2); + srcCount += BuildDelayFreeUses(op3, op1); + } + else if (op1->isContained()) + { + // 231 form: op3 = (op2 * op3) + [op1] + + tgtPrefUse = BuildUse(op3); + + srcCount += BuildOperandUses(op1); + srcCount += BuildDelayFreeUses(op2, op1); + srcCount += 1; + } + else + { + // 213 form: op1 = (op2 * op1) + [op3] + + tgtPrefUse = BuildUse(op1); + srcCount += 1; + + if (copiesUpperBits) + { + srcCount += BuildDelayFreeUses(op2, op1); + } + else + { + tgtPrefUse2 = BuildUse(op2); + srcCount += 1; + } + + srcCount += op3->isContained() ? 
BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1); + } + + buildUses = false; + break; + } + + case NI_AVX2_GatherVector128: + case NI_AVX2_GatherVector256: + { + assert(numArgs == 3); + assert(!isRMW); + + // Any pair of the index, mask, or destination registers should be different + srcCount += BuildOperandUses(op1); + srcCount += BuildDelayFreeUses(op2, op1); + + // op3 should always be contained + assert(op3->isContained()); + + // get a tmp register for mask that will be cleared by gather instructions + buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + + case NI_AVX2_GatherMaskVector128: + case NI_AVX2_GatherMaskVector256: + { + assert(numArgs == 5); + assert(!isRMW); + assert(intrinsicTree->gtGetOp1()->OperIsList()); + + GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest(); + GenTree* op4 = argList->Current(); + + // Any pair of the index, mask, or destination registers should be different + srcCount += BuildOperandUses(op1); + srcCount += BuildDelayFreeUses(op2); + srcCount += BuildDelayFreeUses(op3); + srcCount += BuildDelayFreeUses(op4); + + // op5 should always be contained + assert(argList->Rest()->Current()->isContained()); + + // get a tmp register for mask that will be cleared by gather instructions + buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + + default: + { + assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); + break; + } + } + + if (buildUses) + { + assert((numArgs > 0) && (numArgs < 4)); + + if (intrinsicTree->OperIsMemoryLoadOrStore()) + { + srcCount += BuildAddrUses(op1); + } + else if (isRMW && !op1->isContained()) + { + tgtPrefUse = BuildUse(op1); + srcCount += 1; + } + else + { + srcCount += BuildOperandUses(op1); + } + + if (op2 != nullptr) + { + if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) + { + srcCount += BuildAddrUses(op2->gtGetOp1()); + } + else if (isRMW) + { + if (!op2->isContained() && HWIntrinsicInfo::IsCommutative(intrinsicId)) + { + // When op2 is not contained and we are commutative, we can set op2 + // to also be a tgtPrefUse. Codegen will then swap the operands. + + tgtPrefUse2 = BuildUse(op2); + srcCount += 1; + } + else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet())) + { + // When op2 is not contained or if we are producing a scalar value + // we need to mark it as delay free because the operand and target + // exist in the same register set. + + srcCount += BuildDelayFreeUses(op2); + } + else + { + // When op2 is contained and we are not producing a scalar value we + // have no concerns of overwriting op2 because they exist in different + // register sets. + + srcCount += BuildOperandUses(op2); + } + } + else + { + srcCount += BuildOperandUses(op2); + } + + if (op3 != nullptr) + { + srcCount += isRMW ? BuildDelayFreeUses(op3) : BuildOperandUses(op3); + } + } + } + + buildInternalRegisterUses(); + } + + if (dstCount == 1) + { + BuildDef(intrinsicTree, dstCandidates); + } + else + { + assert(dstCount == 0); + } + + return srcCount; +} +#endif + +//------------------------------------------------------------------------ +// BuildCast: Set the NodeInfo for a GT_CAST. +// +// Arguments: +// cast - The GT_CAST node +// +// Return Value: +// The number of sources consumed by this node. 
+// +int LinearScan::BuildCast(GenTreeCast* cast) +{ + GenTree* src = cast->gtGetOp1(); + + const var_types srcType = genActualType(src->TypeGet()); + const var_types castType = cast->gtCastType; + + regMaskTP candidates = RBM_NONE; +#ifdef TARGET_X86 + if (varTypeIsByte(castType)) + { + candidates = allByteRegs(); + } + + assert(!varTypeIsLong(srcType) || (src->OperIs(GT_LONG) && src->isContained())); +#else + // Overflow checking cast from TYP_(U)LONG to TYP_UINT requires a temporary + // register to extract the upper 32 bits of the 64 bit source register. + if (cast->gtOverflow() && varTypeIsLong(srcType) && (castType == TYP_UINT)) + { + // Here we don't need internal register to be different from targetReg, + // rather require it to be different from operand's reg. + buildInternalIntRegisterDefForNode(cast); + } +#endif + + int srcCount = BuildOperandUses(src, candidates); + buildInternalRegisterUses(); + BuildDef(cast, candidates); + return srcCount; +} + +//----------------------------------------------------------------------------------------- +// BuildIndir: Specify register requirements for address expression of an indirection operation. +// +// Arguments: +// indirTree - GT_IND or GT_STOREIND gentree node +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildIndir(GenTreeIndir* indirTree) +{ + assert(false); + return 0; +} + +//------------------------------------------------------------------------ +// BuildMul: Set the NodeInfo for a multiply. +// +// Arguments: +// tree - The node of interest +// +// Return Value: +// The number of sources consumed by this node. +// +int LinearScan::BuildMul(GenTree* tree) +{ + assert(tree->OperIsMul()); + GenTree* op1 = tree->gtGetOp1(); + GenTree* op2 = tree->gtGetOp2(); + + // Only non-floating point mul has special requirements + if (varTypeIsFloating(tree->TypeGet())) + { + return BuildSimple(tree); + } + + int srcCount = BuildBinaryUses(tree->AsOp()); + int dstCount = 1; + regMaskTP dstCandidates = RBM_NONE; + + bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); + bool requiresOverflowCheck = tree->gtOverflowEx(); + + // There are three forms of x86 multiply: + // one-op form: RDX:RAX = RAX * r/m + // two-op form: reg *= r/m + // three-op form: reg = r/m * imm + + // This special widening 32x32->64 MUL is not used on x64 + CLANG_FORMAT_COMMENT_ANCHOR; +#if defined(TARGET_X86) + if (tree->OperGet() != GT_MUL_LONG) +#endif + { + assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); + } + + // We do use the widening multiply to implement + // the overflow checking for unsigned multiply + // + if (isUnsignedMultiply && requiresOverflowCheck) + { + // The only encoding provided is RDX:RAX = RAX * rm + // + // Here we set RAX as the only destination candidate + // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX + // + dstCandidates = RBM_RAX; + } + else if (tree->OperGet() == GT_MULHI) + { + // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the + // upper 32 bits of the result set the destination candidate to REG_RDX. 
+ dstCandidates = RBM_RDX;
+ }
+#if defined(TARGET_X86)
+ else if (tree->OperGet() == GT_MUL_LONG)
+ {
+ // have to use the encoding:RDX:RAX = RAX * rm
+ dstCandidates = RBM_RAX | RBM_RDX;
+ dstCount = 2;
+ }
+#endif
+ GenTree* containedMemOp = nullptr;
+ if (op1->isContained() && !op1->IsCnsIntOrI())
+ {
+ assert(!op2->isContained() || op2->IsCnsIntOrI());
+ containedMemOp = op1;
+ }
+ else if (op2->isContained() && !op2->IsCnsIntOrI())
+ {
+ containedMemOp = op2;
+ }
+ regMaskTP killMask = getKillSetForMul(tree->AsOp());
+ BuildDefsWithKills(tree, dstCount, dstCandidates, killMask);
+ return srcCount;
+}
+
+//------------------------------------------------------------------------------
+// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set
+// Contains256bitAVX flag when SIMD vector size is 32 bytes
+//
+// Arguments:
+// isFloatingPointType - true if it is floating point type
+// sizeOfSIMDVector - SIMD Vector size
+//
+void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/)
+{
+ assert(false);
+}
+
+#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64)
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 639129a38bad..ab274a1b923b 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3065,6 +3065,9 @@ void Compiler::fgInitArgInfo(GenTreeCall* call)
 
 #elif defined(TARGET_X86)
         passUsingFloatRegs = false;
 
+#elif defined(TARGET_WASM32) || defined(TARGET_WASM64)
+
+ passUsingFloatRegs = varTypeIsFloating(argx);
 #else
 #error Unsupported or unset target architecture
 
@@ -3115,7 +3118,7 @@ void Compiler::fgInitArgInfo(GenTreeCall* call)
             assert(structSize == info.compCompHnd->getClassSize(objClass));
         }
     }
-#if defined(TARGET_AMD64)
+#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm
 #ifdef UNIX_AMD64_ABI
     if (!isStructArg)
     {
diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h
index d06bef0cea1d..c0d565416cce 100644
--- a/src/coreclr/jit/register.h
+++ b/src/coreclr/jit/register.h
@@ -12,7 +12,8 @@
 #define REGALIAS(alias, realname)
 #endif
 
-#if defined(TARGET_XARCH)
+// TODO: WASM doesn't have these but can't compile without them
+#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64)
 
 #if defined(TARGET_X86)
 /*
@@ -69,6 +70,9 @@ REGALIAS(EDI, RDI)
 #ifdef TARGET_AMD64
 #define XMMBASE 16
 #define XMMMASK(x) (__int64(1) << ((x)+XMMBASE))
+#elif defined(TARGET_WASM32) || defined(TARGET_WASM64)
+#define XMMBASE 16
+#define XMMMASK(x) (__int64(1) << ((x)+XMMBASE))
 #else // !TARGET_AMD64
 #define XMMBASE 8
 #define XMMMASK(x) (__int32(1) << ((x)+XMMBASE))
@@ -103,6 +107,7 @@ REGDEF(STK, 16+XMMBASE, 0x0000, "STK" )
 
 #elif defined(TARGET_ARM64)
 #include "registerarm64.h"
+#elif defined(TARGET_WASM32) || defined(TARGET_WASM64)
 
 #else
 #error Unsupported or unset target architecture
 #endif // target type
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index 07d70e20d503..235b6a99d626 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -29,6 +29,21 @@ enum SIMDLevel
 // Floating-point instructions are legacy SSE encoded.
 SIMD_SSE4_Supported = 2,
 
+ // AVX2 - Hardware has AVX and AVX2 instruction set.
+ // Vector length is 256-bit and SIMD instructions are VEX-256 encoded.
+ // Floating-point instructions are VEX-128 encoded.
+ SIMD_AVX2_Supported = 3
+#elif defined(TARGET_WASM32) || defined(TARGET_WASM64)
+ // SSE2 - The min bar of SIMD ISA on x86/x64.
+ // Vector length is 128-bit.
+ // Floating-point instructions are legacy SSE encoded. + SIMD_SSE2_Supported = 1, + + // SSE4 - RyuJIT may generate SSE3, SSSE3, SSE4.1 and SSE4.2 instructions for certain intrinsics. + // Vector length is 128-bit. + // Floating-point instructions are legacy SSE encoded. + SIMD_SSE4_Supported = 2, + // AVX2 - Hardware has AVX and AVX2 instruction set. // Vector length is 256-bit and SIMD instructions are VEX-256 encoded. // Floating-point instructions are VEX-128 encoded. diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index d4d501e5fd72..10df08e0b673 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -22,6 +22,10 @@ #define TARGET_READABLE_NAME "ARM" #elif defined(TARGET_ARM64) #define TARGET_READABLE_NAME "ARM64" +#elif defined(TARGET_WASM64) +#define TARGET_READABLE_NAME "WASM64" +#elif defined(TARGET_WASM32) +#define TARGET_READABLE_NAME "WASM32" #else #error Unsupported or unset target architecture #endif @@ -41,6 +45,9 @@ #define REGMASK_BITS 64 #define CSE_CONST_SHARED_LOW_BITS 12 +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#define REGMASK_BITS 32 +#define CSE_CONST_SHARED_LOW_BITS 16 #else #error Unsupported or unset target architecture #endif @@ -141,6 +148,26 @@ enum _regMask_enum : unsigned #include "register.h" }; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +enum _regNumber_enum : unsigned +{ +#define REGDEF(name, rnum, mask, sname) REG_##name = rnum, +#define REGALIAS(alias, realname) REG_##alias = REG_##realname, +#include "register.h" + + REG_COUNT, + REG_NA = REG_COUNT, + ACTUAL_REG_COUNT = REG_COUNT - 1 // everything but REG_STK (only real regs) +}; + +enum _regMask_enum : unsigned +{ + RBM_NONE = 0, + +#define REGDEF(name, rnum, mask, sname) RBM_##name = mask, +#define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, +#include "register.h" +}; #else #error Unsupported target architecture #endif @@ -1556,6 +1583,748 @@ typedef unsigned char regNumberSmall; // have encoding that restricts what registers that can be used for the indexed element when the element size is H (i.e. 2 bytes). #define RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15) +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: a copy of X64 +#define RBM_LNGRET_LO RBM_EAX +#define REG_LNGRET_HI REG_EDX +#define RBM_LNGRET_HI RBM_EDX + +#if defined(TARGET_WASM32) +#define TARGET_POINTER_SIZE 4 // equal to sizeof(void*) and the managed pointer size in bytes for this +// target +#else +#define TARGET_POINTER_SIZE 8 +#endif + + +// TODO-AMD64-CQ: Fine tune the following xxBlk threshold values: + +#define CPU_LOAD_STORE_ARCH 0 +#define CPU_HAS_FP_SUPPORT 1 +#define ROUND_FLOAT 0 // Do not round intermed float expression results +#define CPU_HAS_BYTE_REGS 0 + +#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk. +#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk. 
+#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
+ // of contiguous non-gc slots that trigger generating rep movsq instead of
+ // sequences of movsq instructions
+
+#ifdef FEATURE_SIMD
+#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
+#if defined(UNIX_AMD64_ABI)
+#define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 0 // Whether SIMD registers are partially saved at calls
+#else // !UNIX_AMD64_ABI
+#define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls
+#endif // !UNIX_AMD64_ABI
+#endif
+#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
+#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
+#define FEATURE_FASTTAILCALL 1 // Tail calls made as epilog+jmp
+#define FEATURE_TAILCALL_OPT 1 // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
+#define FEATURE_SET_FLAGS 0 // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
+#define MAX_PASS_SINGLEREG_BYTES 8 // Maximum size of a struct passed in a single register (double).
+#ifdef UNIX_AMD64_ABI
+#define FEATURE_MULTIREG_ARGS_OR_RET 1 // Support for passing and/or returning single values in more than one register
+#define FEATURE_MULTIREG_ARGS 1 // Support for passing a single argument in more than one register
+#define FEATURE_MULTIREG_RET 1 // Support for returning a single value in more than one register
+#define FEATURE_MULTIREG_STRUCT_PROMOTE 1 // True when we want to promote fields of a multireg struct into registers
+#define FEATURE_STRUCT_CLASSIFIER 1 // Uses a classifier function to determine if structs are passed/returned in more than one register
+#define MAX_PASS_MULTIREG_BYTES 32 // Maximum size of a struct that could be passed in more than one register (Max is two SIMD16s)
+#define MAX_RET_MULTIREG_BYTES 32 // Maximum size of a struct that could be returned in more than one register (Max is two SIMD16s)
+#define MAX_ARG_REG_COUNT 2 // Maximum registers used to pass a single argument in multiple registers.
+#define MAX_RET_REG_COUNT 2 // Maximum registers used to return a value.
+
+#define MAX_MULTIREG_COUNT 2 // Maximum number of registers defined by a single instruction (including calls).
+ // This is also the maximum number of registers for a MultiReg node.
+#else // !UNIX_AMD64_ABI
+#define WINDOWS_AMD64_ABI // Uses the Windows ABI for AMD64
+#define FEATURE_MULTIREG_ARGS_OR_RET 0 // Support for passing and/or returning single values in more than one register
+#define FEATURE_MULTIREG_ARGS 0 // Support for passing a single argument in more than one register
+#define FEATURE_MULTIREG_RET 0 // Support for returning a single value in more than one register
+#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
+#define MAX_PASS_MULTIREG_BYTES 0 // No multireg arguments
+#define MAX_RET_MULTIREG_BYTES 0 // No multireg return values
+#define MAX_ARG_REG_COUNT 1 // Maximum registers used to pass a single argument (no arguments are passed using multiple registers)
+#define MAX_RET_REG_COUNT 1 // Maximum registers used to return a value.
+
+#define MAX_MULTIREG_COUNT 2 // Maximum number of registers defined by a single instruction (including calls).
+ // This is also the maximum number of registers for a MultiReg node.
+ // Note that this must be greater than 1 so that GenTreeLclVar can have an array of + // MAX_MULTIREG_COUNT - 1. +#endif // !UNIX_AMD64_ABI + +#define NOGC_WRITE_BARRIERS 0 // We DO-NOT have specialized WriteBarrier JIT Helpers that DO-NOT trash the RBM_CALLEE_TRASH registers +#define USER_ARGS_COME_LAST 1 +#define EMIT_TRACK_STACK_DEPTH 1 + +#if defined(TARGET_WASM32) +#define TARGET_POINTER_SIZE 4 // equal to sizeof(void*) and the managed pointer size in bytes for this target +#else +#define TARGET_POINTER_SIZE 8 +#endif + +#define FEATURE_EH 1 // To aid platform bring-up, eliminate exceptional EH clauses (catch, filter, filter-handler, fault) and directly execute 'finally' clauses. +#define FEATURE_EH_CALLFINALLY_THUNKS 1 // Generate call-to-finally code in "thunks" in the enclosing EH region, protected by "cloned finally" clauses. +#ifdef UNIX_AMD64_ABI +#define ETW_EBP_FRAMED 1 // if 1 we cannot use EBP as a scratch register and must create EBP based frames for most methods +#else // !UNIX_AMD64_ABI +#define ETW_EBP_FRAMED 0 // if 1 we cannot use EBP as a scratch register and must create EBP based frames for most methods +#endif // !UNIX_AMD64_ABI +#define CSE_CONSTS 1 // Enable if we want to CSE constants + +#define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15) +#define RBM_ALLDOUBLE RBM_ALLFLOAT +#define REG_FP_FIRST REG_XMM0 +#define REG_FP_LAST REG_XMM15 +#define FIRST_FP_ARGREG REG_XMM0 + +#ifdef UNIX_AMD64_ABI +#define LAST_FP_ARGREG REG_XMM7 +#else // !UNIX_AMD64_ABI +#define LAST_FP_ARGREG REG_XMM3 +#endif // !UNIX_AMD64_ABI + +#define REGNUM_BITS 6 // number of bits in a REG_* +#define REGMASK_BITS 32 // number of bits in a REGNUM_MASK +#if defined(TARGET_WASM32) // morph phase uses this +#define REGSIZE_BYTES 4 // number of bytes in one register +#else +#define REGSIZE_BYTES 8 // number of bytes in one register +#endif +#define XMM_REGSIZE_BYTES 16 // XMM register size in bytes +#define YMM_REGSIZE_BYTES 32 // YMM register size in bytes + +#define CODE_ALIGN 1 // code alignment requirement +#define STACK_ALIGN 16 // stack alignment requirement +#define STACK_ALIGN_SHIFT 4 // Shift-right amount to convert size in bytes to size in STACK_ALIGN units == log2(STACK_ALIGN) + +#if ETW_EBP_FRAMED +#define RBM_ETW_FRAMED_EBP RBM_NONE +#define RBM_ETW_FRAMED_EBP_LIST +#define REG_ETW_FRAMED_EBP_LIST +#define REG_ETW_FRAMED_EBP_COUNT 0 +#else // !ETW_EBP_FRAMED +#define RBM_ETW_FRAMED_EBP RBM_EBP +#define RBM_ETW_FRAMED_EBP_LIST RBM_EBP, +#define REG_ETW_FRAMED_EBP_LIST REG_EBP, +#define REG_ETW_FRAMED_EBP_COUNT 1 +#endif // !ETW_EBP_FRAMED + +#ifdef UNIX_AMD64_ABI +#define MIN_ARG_AREA_FOR_CALL 0 // Minimum required outgoing argument space for a call. 
+ +#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) +#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_RDI|RBM_RSI|RBM_EDX|RBM_ECX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) +#define RBM_FLT_CALLEE_SAVED (0) +#define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ + RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) +#define REG_PROFILER_ENTER_ARG_0 REG_R14 +#define RBM_PROFILER_ENTER_ARG_0 RBM_R14 +#define REG_PROFILER_ENTER_ARG_1 REG_R15 +#define RBM_PROFILER_ENTER_ARG_1 RBM_R15 + +#define REG_DEFAULT_PROFILER_CALL_TARGET REG_R11 + +#else // !UNIX_AMD64_ABI +#define MIN_ARG_AREA_FOR_CALL (4 * REGSIZE_BYTES) // Minimum required outgoing argument space for a call. + +#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) +#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) +#define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) +#define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) +#endif // !UNIX_AMD64_ABI + +#define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 +#define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 + +#define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) +#define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) + +#define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH + +#define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) + +#if 0 +#define REG_VAR_ORDER REG_EAX,REG_EDX,REG_ECX,REG_ESI,REG_EDI,REG_EBX,REG_ETW_FRAMED_EBP_LIST \ + REG_R8,REG_R9,REG_R10,REG_R11,REG_R14,REG_R15,REG_R12,REG_R13 +#else + // TEMPORARY ORDER TO AVOID CALLEE-SAVES + // TODO-CQ: Review this and set appropriately +#ifdef UNIX_AMD64_ABI +#define REG_VAR_ORDER REG_EAX,REG_EDI,REG_ESI, \ + REG_EDX,REG_ECX,REG_R8,REG_R9, \ + REG_R10,REG_R11,REG_EBX,REG_ETW_FRAMED_EBP_LIST \ + REG_R14,REG_R15,REG_R12,REG_R13 +#else // !UNIX_AMD64_ABI +#define REG_VAR_ORDER REG_EAX,REG_EDX,REG_ECX, \ + REG_R8,REG_R9,REG_R10,REG_R11, \ + REG_ESI,REG_EDI,REG_EBX,REG_ETW_FRAMED_EBP_LIST \ + REG_R14,REG_R15,REG_R12,REG_R13 +#endif // !UNIX_AMD64_ABI +#endif + +#define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15 + +#ifdef UNIX_AMD64_ABI +#define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT) +#define CNT_CALLEE_TRASH (9) +#define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) + +#define CNT_CALLEE_SAVED_FLOAT (0) +#define CNT_CALLEE_TRASH_FLOAT (16) + +#define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 +#define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 +#else // !UNIX_AMD64_ABI +#define CNT_CALLEE_SAVED (7 + REG_ETW_FRAMED_EBP_COUNT) +#define CNT_CALLEE_TRASH (7) +#define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) + +#define CNT_CALLEE_SAVED_FLOAT (10) +#define CNT_CALLEE_TRASH_FLOAT (6) + +#define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 +#define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 +#endif // !UNIX_AMD64_ABI + +#define CALLEE_SAVED_REG_MAXSZ (CNT_CALLEE_SAVED*REGSIZE_BYTES) +#define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*16) + +// register to hold shift amount +#define REG_SHIFT REG_ECX +#define 
RBM_SHIFT RBM_ECX + +// This is a general scratch register that does not conflict with the argument registers +#define REG_SCRATCH REG_EAX + +// Where is the exception object on entry to the handler block? +#ifdef UNIX_AMD64_ABI +#define REG_EXCEPTION_OBJECT REG_ESI +#define RBM_EXCEPTION_OBJECT RBM_ESI +#else // !UNIX_AMD64_ABI +#define REG_EXCEPTION_OBJECT REG_EDX +#define RBM_EXCEPTION_OBJECT RBM_EDX +#endif // !UNIX_AMD64_ABI + +#define REG_JUMP_THUNK_PARAM REG_EAX +#define RBM_JUMP_THUNK_PARAM RBM_EAX + +// Register to be used for emitting helper calls whose call target is an indir of an +// absolute memory address in case of Rel32 overflow i.e. a data address could not be +// encoded as PC-relative 32-bit offset. +// +// Notes: +// 1) that RAX is callee trash register that is not used for passing parameter and +// also results in smaller instruction encoding. +// 2) Profiler Leave callback requires the return value to be preserved +// in some form. We can use custom calling convention for Leave callback. +// For e.g return value could be preserved in rcx so that it is available for +// profiler. +#define REG_DEFAULT_HELPER_CALL_TARGET REG_RAX +#define RBM_DEFAULT_HELPER_CALL_TARGET RBM_RAX + +// GenericPInvokeCalliHelper VASigCookie Parameter +#define REG_PINVOKE_COOKIE_PARAM REG_R11 +#define RBM_PINVOKE_COOKIE_PARAM RBM_R11 + +// GenericPInvokeCalliHelper unmanaged target Parameter +#define REG_PINVOKE_TARGET_PARAM REG_R10 +#define RBM_PINVOKE_TARGET_PARAM RBM_R10 + +// IL stub's secret MethodDesc parameter (JitFlags::JIT_FLAG_PUBLISH_SECRET_PARAM) +#define REG_SECRET_STUB_PARAM REG_R10 +#define RBM_SECRET_STUB_PARAM RBM_R10 + +// Registers used by PInvoke frame setup +#define REG_PINVOKE_FRAME REG_EDI +#define RBM_PINVOKE_FRAME RBM_EDI +#define REG_PINVOKE_TCB REG_EAX +#define RBM_PINVOKE_TCB RBM_EAX +#define REG_PINVOKE_SCRATCH REG_EAX +#define RBM_PINVOKE_SCRATCH RBM_EAX + +// The following defines are useful for iterating a regNumber +#define REG_FIRST REG_EAX +#define REG_INT_FIRST REG_EAX +#define REG_INT_LAST REG_R15 +#define REG_INT_COUNT (REG_INT_LAST - REG_INT_FIRST + 1) +#define REG_NEXT(reg) ((regNumber)((unsigned)(reg) + 1)) +#define REG_PREV(reg) ((regNumber)((unsigned)(reg) - 1)) + +// Which register are int and long values returned in ? 
+#define REG_INTRET REG_EAX +#define RBM_INTRET RBM_EAX + +#define RBM_LNGRET RBM_EAX + +#ifdef UNIX_AMD64_ABI +#define REG_INTRET_1 REG_RDX +#define RBM_INTRET_1 RBM_RDX + +#define REG_LNGRET_1 REG_RDX +#define RBM_LNGRET_1 RBM_RDX +#endif // UNIX_AMD64_ABI + + +#define REG_FLOATRET REG_XMM0 +#define RBM_FLOATRET RBM_XMM0 +#define REG_DOUBLERET REG_XMM0 +#define RBM_DOUBLERET RBM_XMM0 + +#ifdef UNIX_AMD64_ABI +#define REG_FLOATRET_1 REG_XMM1 +#define RBM_FLOATRET_1 RBM_XMM1 + +#define REG_DOUBLERET_1 REG_XMM1 +#define RBM_DOUBLERET_1 RBM_XMM1 +#endif // UNIX_AMD64_ABI + +#define REG_FPBASE REG_EBP +#define RBM_FPBASE RBM_EBP +#define STR_FPBASE "rbp" +#define REG_SPBASE REG_ESP +#define RBM_SPBASE RBM_ESP +#define STR_SPBASE "rsp" + +#define FIRST_ARG_STACK_OFFS (REGSIZE_BYTES) // return address + +#ifdef UNIX_AMD64_ABI +#define MAX_REG_ARG 6 +#define MAX_FLOAT_REG_ARG 8 +#define REG_ARG_FIRST REG_EDI +#define REG_ARG_LAST REG_R9 +#define INIT_ARG_STACK_SLOT 0 // No outgoing reserved stack slots + +#define REG_ARG_0 REG_EDI +#define REG_ARG_1 REG_ESI +#define REG_ARG_2 REG_EDX +#define REG_ARG_3 REG_ECX +#define REG_ARG_4 REG_R8 +#define REG_ARG_5 REG_R9 + +extern const regNumber intArgRegs[MAX_REG_ARG]; +extern const regMaskTP intArgMasks[MAX_REG_ARG]; +extern const regNumber fltArgRegs[MAX_FLOAT_REG_ARG]; +extern const regMaskTP fltArgMasks[MAX_FLOAT_REG_ARG]; + +#define RBM_ARG_0 RBM_RDI +#define RBM_ARG_1 RBM_RSI +#define RBM_ARG_2 RBM_EDX +#define RBM_ARG_3 RBM_ECX +#define RBM_ARG_4 RBM_R8 +#define RBM_ARG_5 RBM_R9 +#else // !UNIX_AMD64_ABI +#define MAX_REG_ARG 4 +#define MAX_FLOAT_REG_ARG 4 +#define REG_ARG_FIRST REG_ECX +#define REG_ARG_LAST REG_R9 +#define INIT_ARG_STACK_SLOT 4 // 4 outgoing reserved stack slots + +#define REG_ARG_0 REG_ECX +#define REG_ARG_1 REG_EDX +#define REG_ARG_2 REG_R8 +#define REG_ARG_3 REG_R9 + +extern const regNumber intArgRegs[MAX_REG_ARG]; +extern const regMaskTP intArgMasks[MAX_REG_ARG]; +extern const regNumber fltArgRegs[MAX_FLOAT_REG_ARG]; +extern const regMaskTP fltArgMasks[MAX_FLOAT_REG_ARG]; + +#define RBM_ARG_0 RBM_ECX +#define RBM_ARG_1 RBM_EDX +#define RBM_ARG_2 RBM_R8 +#define RBM_ARG_3 RBM_R9 +#endif // !UNIX_AMD64_ABI + +#define REG_FLTARG_0 REG_XMM0 +#define REG_FLTARG_1 REG_XMM1 +#define REG_FLTARG_2 REG_XMM2 +#define REG_FLTARG_3 REG_XMM3 + +#define RBM_FLTARG_0 RBM_XMM0 +#define RBM_FLTARG_1 RBM_XMM1 +#define RBM_FLTARG_2 RBM_XMM2 +#define RBM_FLTARG_3 RBM_XMM3 + +#ifdef UNIX_AMD64_ABI +#define REG_FLTARG_4 REG_XMM4 +#define REG_FLTARG_5 REG_XMM5 +#define REG_FLTARG_6 REG_XMM6 +#define REG_FLTARG_7 REG_XMM7 + +#define RBM_FLTARG_4 RBM_XMM4 +#define RBM_FLTARG_5 RBM_XMM5 +#define RBM_FLTARG_6 RBM_XMM6 +#define RBM_FLTARG_7 RBM_XMM7 + +#define RBM_ARG_REGS (RBM_ARG_0|RBM_ARG_1|RBM_ARG_2|RBM_ARG_3|RBM_ARG_4|RBM_ARG_5) +#define RBM_FLTARG_REGS (RBM_FLTARG_0|RBM_FLTARG_1|RBM_FLTARG_2|RBM_FLTARG_3|RBM_FLTARG_4|RBM_FLTARG_5|RBM_FLTARG_6|RBM_FLTARG_7) +#else // !UNIX_AMD64_ABI +#define RBM_ARG_REGS (RBM_ARG_0|RBM_ARG_1|RBM_ARG_2|RBM_ARG_3) +#define RBM_FLTARG_REGS (RBM_FLTARG_0|RBM_FLTARG_1|RBM_FLTARG_2|RBM_FLTARG_3) +#endif // !UNIX_AMD64_ABI + +// The registers trashed by profiler enter/leave/tailcall hook +// See vm\amd64\asmhelpers.asm for more details. +#define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH +#define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH + +// The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper. +#ifdef UNIX_AMD64_ABI + // See vm\amd64\unixasmhelpers.S for more details. 
+ // + // On Unix a struct of size >=9 and <=16 bytes in size is returned in two return registers. + // The return registers could be any two from the set { RAX, RDX, XMM0, XMM1 }. + // STOP_FOR_GC helper preserves all the 4 possible return registers. +#define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) +#define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) +#else + // See vm\amd64\asmhelpers.asm for more details. +#define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) +#define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) +#endif + + // The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. +#define RBM_INIT_PINVOKE_FRAME_TRASH RBM_CALLEE_TRASH + +// What sort of reloc do we use for [disp32] address mode +#define IMAGE_REL_BASED_DISP32 IMAGE_REL_BASED_REL32 + +// What sort of reloc to we use for 'moffset' address mode (for 'mov eax, moffset' or 'mov moffset, eax') +#define IMAGE_REL_BASED_MOFFSET IMAGE_REL_BASED_DIR64 + +// Pointer-sized string move instructions +#define INS_movsp INS_movsq +#define INS_r_movsp INS_r_movsq +#define INS_stosp INS_stosq +#define INS_r_stosp INS_r_stosq + +// AMD64 uses FEATURE_FIXED_OUT_ARGS so this can be zero. +#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 0 + +#define REG_STACK_PROBE_HELPER_ARG REG_R11 +#define RBM_STACK_PROBE_HELPER_ARG RBM_R11 + +#ifdef TARGET_UNIX +#define RBM_STACK_PROBE_HELPER_TRASH RBM_NONE +#else // !TARGET_UNIX +#define RBM_STACK_PROBE_HELPER_TRASH RBM_RAX +#endif // !TARGET_UNIX + +#elif defined(TARGET_ARM) + +// TODO-ARM-CQ: Use shift for division by power of 2 +// TODO-ARM-CQ: Check for sdiv/udiv at runtime and generate it if available +#define USE_HELPERS_FOR_INT_DIV 1 // BeagleBoard (ARMv7A) doesn't support SDIV/UDIV +#define CPU_LOAD_STORE_ARCH 1 +#define CPU_HAS_FP_SUPPORT 1 +#define ROUND_FLOAT 0 // Do not round intermed float expression results +#define CPU_HAS_BYTE_REGS 0 + +#define CPBLK_UNROLL_LIMIT 32 // Upper bound to let the code generator to loop unroll CpBlk. +#define INITBLK_UNROLL_LIMIT 16 // Upper bound to let the code generator to loop unroll InitBlk. + +#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog +#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers +#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers +#define FEATURE_FASTTAILCALL 0 // Tail calls made as epilog+jmp +#define FEATURE_TAILCALL_OPT 0 // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls. +#define FEATURE_SET_FLAGS 1 // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set +#define FEATURE_MULTIREG_ARGS_OR_RET 1 // Support for passing and/or returning single values in more than one register (including HFA support) +#define FEATURE_MULTIREG_ARGS 1 // Support for passing a single argument in more than one register (including passing HFAs) +#define FEATURE_MULTIREG_RET 1 // Support for returning a single value in more than one register (including HFA returns) +#define FEATURE_STRUCT_CLASSIFIER 0 // Uses a classifier function to determine is structs are passed/returned in more than one register +#define MAX_PASS_SINGLEREG_BYTES 8 // Maximum size of a struct passed in a single register (double). 
+#define MAX_PASS_MULTIREG_BYTES 32 // Maximum size of a struct that could be passed in more than one register (Max is an HFA of 4 doubles)
+#define MAX_RET_MULTIREG_BYTES 32 // Maximum size of a struct that could be returned in more than one register (Max is an HFA of 4 doubles)
+#define MAX_ARG_REG_COUNT 4 // Maximum registers used to pass a single argument in multiple registers. (max is 4 floats or doubles using an HFA)
+#define MAX_RET_REG_COUNT 4 // Maximum registers used to return a value.
+
+#define MAX_MULTIREG_COUNT 4 // Maximum number of registers defined by a single instruction (including calls).
+ // This is also the maximum number of registers for a MultiReg node.
+
+#define NOGC_WRITE_BARRIERS 0 // We DO-NOT have specialized WriteBarrier JIT Helpers that DO-NOT trash the RBM_CALLEE_TRASH registers
+#define USER_ARGS_COME_LAST 1
+#define EMIT_TRACK_STACK_DEPTH 1 // This is something of a workaround. For both ARM and AMD64, the frame size is fixed, so we don't really
+ // need to track stack depth, but this is currently necessary to get GC information reported at call sites.
+#define TARGET_POINTER_SIZE 4 // equal to sizeof(void*) and the managed pointer size in bytes for this target
+#define FEATURE_EH 1 // To aid platform bring-up, eliminate exceptional EH clauses (catch, filter, filter-handler, fault) and directly execute 'finally' clauses.
+#define FEATURE_EH_CALLFINALLY_THUNKS 0 // Generate call-to-finally code in "thunks" in the enclosing EH region, protected by "cloned finally" clauses.
+#define ETW_EBP_FRAMED 1 // if 1 we cannot use REG_FP as a scratch register and must setup the frame pointer for most methods
+#define CSE_CONSTS 1 // Enable if we want to CSE constants
+
+#define REG_FP_FIRST REG_F0
+#define REG_FP_LAST REG_F31
+#define FIRST_FP_ARGREG REG_F0
+#define LAST_FP_ARGREG REG_F15
+
+#define REGNUM_BITS 6 // number of bits in a REG_*
+#define REGMASK_BITS 64 // number of bits in a REGNUM_MASK
+#define REGSIZE_BYTES 4 // number of bytes in one register
+#define MIN_ARG_AREA_FOR_CALL 0 // Minimum required outgoing argument space for a call.
+ +#define CODE_ALIGN 2 // code alignment requirement +#define STACK_ALIGN 8 // stack alignment requirement + +#define RBM_INT_CALLEE_SAVED (RBM_R4|RBM_R5|RBM_R6|RBM_R7|RBM_R8|RBM_R9|RBM_R10) +#define RBM_INT_CALLEE_TRASH (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R12|RBM_LR) +#define RBM_FLT_CALLEE_SAVED (RBM_F16|RBM_F17|RBM_F18|RBM_F19|RBM_F20|RBM_F21|RBM_F22|RBM_F23|RBM_F24|RBM_F25|RBM_F26|RBM_F27|RBM_F28|RBM_F29|RBM_F30|RBM_F31) +#define RBM_FLT_CALLEE_TRASH (RBM_F0|RBM_F1|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7|RBM_F8|RBM_F9|RBM_F10|RBM_F11|RBM_F12|RBM_F13|RBM_F14|RBM_F15) + +#define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) +#define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + +#define REG_DEFAULT_HELPER_CALL_TARGET REG_R12 +#define RBM_DEFAULT_HELPER_CALL_TARGET RBM_R12 + +#define REG_FASTTAILCALL_TARGET REG_R12 // Target register for fast tail call +#define RBM_FASTTAILCALL_TARGET RBM_R12 + +#define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) +#define RBM_ALLFLOAT (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH) +#define RBM_ALLDOUBLE (RBM_F0|RBM_F2|RBM_F4|RBM_F6|RBM_F8|RBM_F10|RBM_F12|RBM_F14|RBM_F16|RBM_F18|RBM_F20|RBM_F22|RBM_F24|RBM_F26|RBM_F28|RBM_F30) + +#define REG_VAR_ORDER REG_R3,REG_R2,REG_R1,REG_R0,REG_R4,REG_LR,REG_R12,\ + REG_R5,REG_R6,REG_R7,REG_R8,REG_R9,REG_R10 + +#define REG_VAR_ORDER_FLT REG_F8, REG_F9, REG_F10, REG_F11, \ + REG_F12, REG_F13, REG_F14, REG_F15, \ + REG_F6, REG_F7, REG_F4, REG_F5, \ + REG_F2, REG_F3, REG_F0, REG_F1, \ + REG_F16, REG_F17, REG_F18, REG_F19, \ + REG_F20, REG_F21, REG_F22, REG_F23, \ + REG_F24, REG_F25, REG_F26, REG_F27, \ + REG_F28, REG_F29, REG_F30, REG_F31, + +#define RBM_LOW_REGS (RBM_R0|RBM_R1|RBM_R2|RBM_R3|RBM_R4|RBM_R5|RBM_R6|RBM_R7) +#define RBM_HIGH_REGS (RBM_R8|RBM_R9|RBM_R10|RBM_R11|RBM_R12|RBM_SP|RBM_LR|RBM_PC) + +#define REG_CALLEE_SAVED_ORDER REG_R4,REG_R5,REG_R6,REG_R7,REG_R8,REG_R9,REG_R10,REG_R11 +#define RBM_CALLEE_SAVED_ORDER RBM_R4,RBM_R5,RBM_R6,RBM_R7,RBM_R8,RBM_R9,RBM_R10,RBM_R11 + +#define CNT_CALLEE_SAVED (8) +#define CNT_CALLEE_TRASH (6) +#define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED-1) + +#define CNT_CALLEE_SAVED_FLOAT (16) +#define CNT_CALLEE_TRASH_FLOAT (16) + +#define CALLEE_SAVED_REG_MAXSZ (CNT_CALLEE_SAVED*REGSIZE_BYTES) +#define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*sizeof(float)) + +// Temporary registers used for the GS cookie check. +#define REG_GSCOOKIE_TMP_0 REG_R12 +#define REG_GSCOOKIE_TMP_1 REG_LR + +// register to hold shift amount; no special register is required on the ARM +#define REG_SHIFT REG_NA +#define RBM_SHIFT RBM_ALLINT + +// register to hold shift amount when shifting 64-bit values (this uses a helper call) +#define REG_SHIFT_LNG REG_R2 // REG_ARG_2 +#define RBM_SHIFT_LNG RBM_R2 // RBM_ARG_2 + +// This is a general scratch register that does not conflict with the argument registers +#define REG_SCRATCH REG_LR + +// This is a general register that can be optionally reserved for other purposes during codegen +#define REG_OPT_RSVD REG_R10 +#define RBM_OPT_RSVD RBM_R10 + +// We reserve R9 to store SP on entry for stack unwinding when localloc is used +// This needs to stay in sync with the ARM version of InlinedCallFrame::UpdateRegDisplay code. +#define REG_SAVED_LOCALLOC_SP REG_R9 +#define RBM_SAVED_LOCALLOC_SP RBM_R9 + +// Where is the exception object on entry to the handler block? 
+#define REG_EXCEPTION_OBJECT REG_R0 +#define RBM_EXCEPTION_OBJECT RBM_R0 + +#define REG_JUMP_THUNK_PARAM REG_R12 +#define RBM_JUMP_THUNK_PARAM RBM_R12 + +// ARM write barrier ABI (see vm\arm\asmhelpers.asm, vm\arm\asmhelpers.S): +// CORINFO_HELP_ASSIGN_REF (JIT_WriteBarrier), CORINFO_HELP_CHECKED_ASSIGN_REF (JIT_CheckedWriteBarrier): +// On entry: +// r0: the destination address (LHS of the assignment) +// r1: the object reference (RHS of the assignment) +// On exit: +// r0: trashed +// r3: trashed +// CORINFO_HELP_ASSIGN_BYREF (JIT_ByRefWriteBarrier): +// On entry: +// r0: the destination address (object reference written here) +// r1: the source address (points to object reference to write) +// On exit: +// r0: incremented by 4 +// r1: incremented by 4 +// r2: trashed +// r3: trashed + +#define REG_WRITE_BARRIER_DST_BYREF REG_ARG_0 +#define RBM_WRITE_BARRIER_DST_BYREF RBM_ARG_0 + +#define REG_WRITE_BARRIER_SRC_BYREF REG_ARG_1 +#define RBM_WRITE_BARRIER_SRC_BYREF RBM_ARG_1 + +#define RBM_CALLEE_TRASH_NOGC (RBM_R2|RBM_R3|RBM_LR|RBM_DEFAULT_HELPER_CALL_TARGET) + +// Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. +#define RBM_CALLEE_TRASH_WRITEBARRIER (RBM_R0|RBM_R3|RBM_LR|RBM_DEFAULT_HELPER_CALL_TARGET) + +// Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. +#define RBM_CALLEE_GCTRASH_WRITEBARRIER RBM_CALLEE_TRASH_WRITEBARRIER + +// Registers killed by CORINFO_HELP_ASSIGN_BYREF. +#define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC) + +// Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. +// Note that r0 and r1 are still valid byref pointers after this helper call, despite their value being changed. +#define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF RBM_CALLEE_TRASH_NOGC + +// GenericPInvokeCalliHelper VASigCookie Parameter +#define REG_PINVOKE_COOKIE_PARAM REG_R4 +#define RBM_PINVOKE_COOKIE_PARAM RBM_R4 + +// GenericPInvokeCalliHelper unmanaged target Parameter +#define REG_PINVOKE_TARGET_PARAM REG_R12 +#define RBM_PINVOKE_TARGET_PARAM RBM_R12 + +// IL stub's secret MethodDesc parameter (JitFlags::JIT_FLAG_PUBLISH_SECRET_PARAM) +#define REG_SECRET_STUB_PARAM REG_R12 +#define RBM_SECRET_STUB_PARAM RBM_R12 + +// R2R indirect call. Use the same registers as VSD +#define REG_R2R_INDIRECT_PARAM REG_R4 +#define RBM_R2R_INDIRECT_PARAM RBM_R4 + +// JMP Indirect call register +#define REG_INDIRECT_CALL_TARGET_REG REG_R12 + +// Registers used by PInvoke frame setup +#define REG_PINVOKE_FRAME REG_R4 +#define RBM_PINVOKE_FRAME RBM_R4 +#define REG_PINVOKE_TCB REG_R5 +#define RBM_PINVOKE_TCB RBM_R5 +#define REG_PINVOKE_SCRATCH REG_R6 +#define RBM_PINVOKE_SCRATCH RBM_R6 + +// The following defines are useful for iterating a regNumber +#define REG_FIRST REG_R0 +#define REG_INT_FIRST REG_R0 +#define REG_INT_LAST REG_LR +#define REG_INT_COUNT (REG_INT_LAST - REG_INT_FIRST + 1) +#define REG_NEXT(reg) ((regNumber)((unsigned)(reg) + 1)) +#define REG_PREV(reg) ((regNumber)((unsigned)(reg) - 1)) + +// The following registers are used in emitting Enter/Leave/Tailcall profiler callbacks +#define REG_PROFILER_ENTER_ARG REG_R0 +#define RBM_PROFILER_ENTER_ARG RBM_R0 +#define REG_PROFILER_RET_SCRATCH REG_R2 +#define RBM_PROFILER_RET_SCRATCH RBM_R2 + +// The registers trashed by profiler enter/leave/tailcall hook +// See vm\arm\asmhelpers.asm for more details. 
+#define RBM_PROFILER_ENTER_TRASH RBM_NONE +// While REG_PROFILER_RET_SCRATCH is not trashed by the method, the register allocator must +// consider it killed by the return. +#define RBM_PROFILER_LEAVE_TRASH RBM_PROFILER_RET_SCRATCH +#define RBM_PROFILER_TAILCALL_TRASH RBM_NONE + +// Which register are int and long values returned in ? +#define REG_INTRET REG_R0 +#define RBM_INTRET RBM_R0 +#define RBM_LNGRET (RBM_R1|RBM_R0) +#define REG_LNGRET_LO REG_R0 +#define REG_LNGRET_HI REG_R1 +#define RBM_LNGRET_LO RBM_R0 +#define RBM_LNGRET_HI RBM_R1 + +#define REG_FLOATRET REG_F0 +#define RBM_FLOATRET RBM_F0 +#define RBM_DOUBLERET (RBM_F0|RBM_F1) + +// The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper (JIT_RareDisableHelper). +// See vm\arm\amshelpers.asm for more details. +#define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_LNGRET|RBM_R7|RBM_R8|RBM_R11|RBM_DOUBLERET|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7)) + +// The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. +#define RBM_INIT_PINVOKE_FRAME_TRASH (RBM_CALLEE_TRASH | RBM_PINVOKE_TCB | RBM_PINVOKE_SCRATCH) + +#define REG_FPBASE REG_R11 +#define RBM_FPBASE RBM_R11 +#define STR_FPBASE "r11" +#define REG_SPBASE REG_SP +#define RBM_SPBASE RBM_SP +#define STR_SPBASE "sp" + +#define FIRST_ARG_STACK_OFFS (2*REGSIZE_BYTES) // Caller's saved FP and return address + +#define MAX_REG_ARG 4 +#define MAX_FLOAT_REG_ARG 16 +#define MAX_HFA_RET_SLOTS 8 + +#define REG_ARG_FIRST REG_R0 +#define REG_ARG_LAST REG_R3 +#define REG_ARG_FP_FIRST REG_F0 +#define REG_ARG_FP_LAST REG_F7 +#define INIT_ARG_STACK_SLOT 0 // No outgoing reserved stack slots + +#define REG_ARG_0 REG_R0 +#define REG_ARG_1 REG_R1 +#define REG_ARG_2 REG_R2 +#define REG_ARG_3 REG_R3 + +extern const regNumber intArgRegs[MAX_REG_ARG]; +extern const regMaskTP intArgMasks[MAX_REG_ARG]; + +#define RBM_ARG_0 RBM_R0 +#define RBM_ARG_1 RBM_R1 +#define RBM_ARG_2 RBM_R2 +#define RBM_ARG_3 RBM_R3 + +#define RBM_ARG_REGS (RBM_ARG_0|RBM_ARG_1|RBM_ARG_2|RBM_ARG_3) +#define RBM_FLTARG_REGS (RBM_F0|RBM_F1|RBM_F2|RBM_F3|RBM_F4|RBM_F5|RBM_F6|RBM_F7|RBM_F8|RBM_F9|RBM_F10|RBM_F11|RBM_F12|RBM_F13|RBM_F14|RBM_F15) +#define RBM_DBL_REGS RBM_ALLDOUBLE + +extern const regNumber fltArgRegs[MAX_FLOAT_REG_ARG]; +extern const regMaskTP fltArgMasks[MAX_FLOAT_REG_ARG]; + +#define LBL_DIST_SMALL_MAX_NEG (0) +#define LBL_DIST_SMALL_MAX_POS (+1020) +#define LBL_DIST_MED_MAX_NEG (-4095) +#define LBL_DIST_MED_MAX_POS (+4096) + +#define JMP_DIST_SMALL_MAX_NEG (-2048) +#define JMP_DIST_SMALL_MAX_POS (+2046) + +#define CALL_DIST_MAX_NEG (-16777216) +#define CALL_DIST_MAX_POS (+16777214) + +#define JCC_DIST_SMALL_MAX_NEG (-256) +#define JCC_DIST_SMALL_MAX_POS (+254) + +#define JCC_DIST_MEDIUM_MAX_NEG (-1048576) +#define JCC_DIST_MEDIUM_MAX_POS (+1048574) + +#define LBL_SIZE_SMALL (2) + +#define JMP_SIZE_SMALL (2) +#define JMP_SIZE_LARGE (4) + +#define JCC_SIZE_SMALL (2) +#define JCC_SIZE_MEDIUM (4) +#define JCC_SIZE_LARGE (6) + +// The first thing in an ARM32 prolog pushes LR to the stack, so this can be 0. 
+#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 0 + +#define REG_STACK_PROBE_HELPER_ARG REG_R4 +#define RBM_STACK_PROBE_HELPER_ARG RBM_R4 +#define REG_STACK_PROBE_HELPER_CALL_TARGET REG_R5 +#define RBM_STACK_PROBE_HELPER_CALL_TARGET RBM_R5 +#define RBM_STACK_PROBE_HELPER_TRASH (RBM_R5 | RBM_LR) + #else #error Unsupported or unset target architecture #endif @@ -1579,9 +2348,11 @@ typedef unsigned char regNumberSmall; #endif // TARGET_XARCH +#if !defined(TARGET_WASM32) && !defined(TARGET_WASM64) // has no registers C_ASSERT(REG_FIRST == 0); C_ASSERT(REG_INT_FIRST < REG_INT_LAST); C_ASSERT(REG_FP_FIRST < REG_FP_LAST); +#endif // Opportunistic tail call feature converts non-tail prefixed calls into // tail calls where possible. It requires fast tail calling mechanism for @@ -1654,8 +2425,12 @@ inline regMaskTP genRegMaskFloat(regNumber reg, var_types type = TYP_DOUBLE); */ inline bool genIsValidReg(regNumber reg) { +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // infinite "registers" + return true; +#else /* It's safest to perform an unsigned comparison in case reg is negative */ return ((unsigned)reg < (unsigned)REG_COUNT); +#endif } /***************************************************************************** @@ -1765,7 +2540,11 @@ inline regMaskTP fullIntArgRegMask() // inline bool isValidIntArgReg(regNumber reg) { +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) + return true; +#else return (genRegMask(reg) & fullIntArgRegMask()) != 0; +#endif } //------------------------------------------------------------------------------------------- @@ -1842,6 +2621,10 @@ inline regMaskTP genRegMask(regNumber reg) regMaskTP result = 1 << reg; assert(result == regMasks[reg]); return result; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + regMaskTP result = 1 << reg; + assert(result == regMasks[reg]); + return result; #else return regMasks[reg]; #endif @@ -1854,7 +2637,7 @@ inline regMaskTP genRegMask(regNumber reg) inline regMaskTP genRegMaskFloat(regNumber reg, var_types type /* = TYP_DOUBLE */) { -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_X86) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_X86) || defined(TARGET_WASM32) || defined(TARGET_WASM64) assert(genIsValidFloatReg(reg)); assert((unsigned)reg < ArrLen(regMasks)); return regMasks[reg]; @@ -1983,11 +2766,12 @@ inline bool isFloatRegType(var_types type) // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined. #if defined(WINDOWS_AMD64_ABI) -#if !defined(TARGET_AMD64) +#if !defined(TARGET_AMD64) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) #error When WINDOWS_AMD64_ABI is defined you must define TARGET_AMD64 defined as well. #endif #endif +#if !defined(TARGET_WASM32) && !defined(TARGET_WASM64) /*****************************************************************************/ // Some sanity checks on some of the register masks // Stack pointer is never part of RBM_ALLINT @@ -2000,6 +2784,7 @@ C_ASSERT((RBM_ALLINT & RBM_FPBASE) == RBM_NONE); C_ASSERT((RBM_INT_CALLEE_SAVED & RBM_FPBASE) == RBM_NONE); #endif /*****************************************************************************/ +#endif #ifdef TARGET_64BIT typedef unsigned __int64 target_size_t; diff --git a/src/coreclr/jit/targetwasm.cpp b/src/coreclr/jit/targetwasm.cpp new file mode 100644 index 000000000000..7f752a5364b0 --- /dev/null +++ b/src/coreclr/jit/targetwasm.cpp @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. + +/*****************************************************************************/ + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) + +#include "target.h" + +const char* Target::g_tgtCPUName = "wasm"; +const Target::ArgOrder Target::g_tgtArgOrder = ARG_ORDER_R2L; +const Target::ArgOrder Target::g_tgtUnmanagedArgOrder = ARG_ORDER_R2L; + +// clang-format off +#ifdef UNIX_AMD64_ABI +const regNumber intArgRegs [] = { REG_EDI, REG_ESI, REG_EDX, REG_ECX, REG_R8, REG_R9 }; +const regMaskTP intArgMasks[] = { RBM_EDI, RBM_ESI, RBM_EDX, RBM_ECX, RBM_R8, RBM_R9 }; +const regNumber fltArgRegs [] = { REG_XMM0, REG_XMM1, REG_XMM2, REG_XMM3, REG_XMM4, REG_XMM5, REG_XMM6, REG_XMM7 }; +const regMaskTP fltArgMasks[] = { RBM_XMM0, RBM_XMM1, RBM_XMM2, RBM_XMM3, RBM_XMM4, RBM_XMM5, RBM_XMM6, RBM_XMM7 }; +#else // !UNIX_AMD64_ABI +const regNumber intArgRegs [] = { REG_ECX, REG_EDX, REG_R8, REG_R9 }; +const regMaskTP intArgMasks[] = { RBM_ECX, RBM_EDX, RBM_R8, RBM_R9 }; +const regNumber fltArgRegs [] = { REG_XMM0, REG_XMM1, REG_XMM2, REG_XMM3 }; +const regMaskTP fltArgMasks[] = { RBM_XMM0, RBM_XMM1, RBM_XMM2, RBM_XMM3 }; +#endif // !UNIX_AMD64_ABI +// clang-format on + +#endif // TARGET_AMD64 diff --git a/src/coreclr/jit/unwind.cpp b/src/coreclr/jit/unwind.cpp index 7bbccd789d6c..c8db0dc94498 100644 --- a/src/coreclr/jit/unwind.cpp +++ b/src/coreclr/jit/unwind.cpp @@ -425,6 +425,8 @@ UNATIVE_OFFSET Compiler::unwindGetCurrentOffset(FuncInfoDsc* func) // See unwindX86.cpp +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO + #else // TARGET* #error Unsupported or unset target architecture diff --git a/src/coreclr/jit/unwindwasm.cpp b/src/coreclr/jit/unwindwasm.cpp new file mode 100644 index 000000000000..6cf194c762fd --- /dev/null +++ b/src/coreclr/jit/unwindwasm.cpp @@ -0,0 +1,426 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XX XX +XX UnwindInfo XX +XX XX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +*/ + +#include "jitpch.h" +#ifdef _MSC_VER +#pragma hdrstop +#endif + +#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +typedef union _UNWIND_CODE { + struct { + UCHAR CodeOffset; + UCHAR UnwindOp : 4; + UCHAR OpInfo : 4; + }; + + struct { + UCHAR OffsetLow; + UCHAR UnwindOp : 4; + UCHAR OffsetHigh : 4; + } EpilogueCode; + + USHORT FrameOffset; +} UNWIND_CODE, * PUNWIND_CODE; +typedef struct _UNWIND_INFO { + UCHAR Version : 3; + UCHAR Flags : 5; + UCHAR SizeOfProlog; + UCHAR CountOfUnwindCodes; + UCHAR FrameRegister : 4; + UCHAR FrameOffset : 4; + UNWIND_CODE UnwindCode[1]; + + // + // The unwind codes are followed by an optional DWORD aligned field that + // contains the exception handler address or the address of chained unwind + // information. If an exception handler address is specified, then it is + // followed by the language specified exception handler data. 
+ // + // union { + // ULONG ExceptionHandler; + // ULONG FunctionEntry; + // }; + // + // ULONG ExceptionData[]; + // + +} UNWIND_INFO, * PUNWIND_INFO; + +#ifdef UNIX_AMD64_ABI +short Compiler::mapRegNumToDwarfReg(regNumber reg) +{ + short dwarfReg = DWARF_REG_ILLEGAL; + + switch (reg) + { + case REG_RAX: + dwarfReg = 0; + break; + case REG_RCX: + dwarfReg = 2; + break; + case REG_RDX: + dwarfReg = 1; + break; + case REG_RBX: + dwarfReg = 3; + break; + case REG_RSP: + dwarfReg = 7; + break; + case REG_RBP: + dwarfReg = 6; + break; + case REG_RSI: + dwarfReg = 4; + break; + case REG_RDI: + dwarfReg = 5; + break; + case REG_R8: + dwarfReg = 8; + break; + case REG_R9: + dwarfReg = 9; + break; + case REG_R10: + dwarfReg = 10; + break; + case REG_R11: + dwarfReg = 11; + break; + case REG_R12: + dwarfReg = 12; + break; + case REG_R13: + dwarfReg = 13; + break; + case REG_R14: + dwarfReg = 14; + break; + case REG_R15: + dwarfReg = 15; + break; + case REG_XMM0: + dwarfReg = 17; + break; + case REG_XMM1: + dwarfReg = 18; + break; + case REG_XMM2: + dwarfReg = 19; + break; + case REG_XMM3: + dwarfReg = 20; + break; + case REG_XMM4: + dwarfReg = 21; + break; + case REG_XMM5: + dwarfReg = 22; + break; + case REG_XMM6: + dwarfReg = 23; + break; + case REG_XMM7: + dwarfReg = 24; + break; + case REG_XMM8: + dwarfReg = 25; + break; + case REG_XMM9: + dwarfReg = 26; + break; + case REG_XMM10: + dwarfReg = 27; + break; + case REG_XMM11: + dwarfReg = 28; + break; + case REG_XMM12: + dwarfReg = 29; + break; + case REG_XMM13: + dwarfReg = 30; + break; + case REG_XMM14: + dwarfReg = 31; + break; + case REG_XMM15: + dwarfReg = 32; + break; + default: + noway_assert(!"unexpected REG_NUM"); + } + + return dwarfReg; +} + +#endif // UNIX_AMD64_ABI + +//------------------------------------------------------------------------ +// Compiler::unwindBegProlog: Initialize the unwind info data structures. +// Called at the beginning of main function or funclet prolog generation. +// +void Compiler::unwindBegProlog() +{ +#ifdef UNIX_AMD64_ABI + if (generateCFIUnwindCodes()) + { + unwindBegPrologCFI(); + } + else +#endif // UNIX_AMD64_ABI + { + unwindBegPrologWindows(); + } +} + +void Compiler::unwindBegPrologWindows() +{ + assert(false); +} + +//------------------------------------------------------------------------ +// Compiler::unwindEndProlog: Called at the end of main function or funclet +// prolog generation to indicate there is no more unwind information for this prolog. +// +void Compiler::unwindEndProlog() +{ + assert(compGeneratingProlog); +} + +//------------------------------------------------------------------------ +// Compiler::unwindBegEpilog: Called at the beginning of main function or funclet +// epilog generation. +// +void Compiler::unwindBegEpilog() +{ + assert(compGeneratingEpilog); +} + +//------------------------------------------------------------------------ +// Compiler::unwindEndEpilog: Called at the end of main function or funclet +// epilog generation. +// +void Compiler::unwindEndEpilog() +{ + assert(compGeneratingEpilog); +} + +//------------------------------------------------------------------------ +// Compiler::unwindPush: Record a push/save of a register. +// +// Arguments: +// reg - The register being pushed/saved. 
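//
// Notes:
//    The Windows-style helpers these wasm unwind callbacks forward to (unwindPushWindows,
//    unwindAllocStackWindows, and so on) are reduced to assert(false) below; the apparent
//    assumption is that unwind data for WebAssembly is produced by the LLVM toolchain
//    rather than by RyuJIT, so these paths should never be reached.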
+// +void Compiler::unwindPush(regNumber reg) +{ +#ifdef UNIX_AMD64_ABI + if (generateCFIUnwindCodes()) + { + unwindPushPopCFI(reg); + } + else +#endif // UNIX_AMD64_ABI + { + unwindPushWindows(reg); + } +} + +void Compiler::unwindPushWindows(regNumber reg) +{ + assert(false); +} + +#ifdef UNIX_AMD64_ABI +#endif // UNIX_AMD64_ABI + +//------------------------------------------------------------------------ +// Compiler::unwindAllocStack: Record a stack frame allocation (sub sp, X). +// +// Arguments: +// size - The size of the stack frame allocation (the amount subtracted from the stack pointer). +// +void Compiler::unwindAllocStack(unsigned size) +{ +#ifdef UNIX_AMD64_ABI + if (generateCFIUnwindCodes()) + { + unwindAllocStackCFI(size); + } + else +#endif // UNIX_AMD64_ABI + { + unwindAllocStackWindows(size); + } +} + +void Compiler::unwindAllocStackWindows(unsigned size) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// Compiler::unwindSetFrameReg: Record a frame register. +// +// Arguments: +// reg - The register being set as the frame register. +// offset - The offset from the current stack pointer that the frame pointer will point at. +// +void Compiler::unwindSetFrameReg(regNumber reg, unsigned offset) +{ +#ifdef UNIX_AMD64_ABI + if (generateCFIUnwindCodes()) + { + unwindSetFrameRegCFI(reg, offset); + } + else +#endif // UNIX_AMD64_ABI + { + unwindSetFrameRegWindows(reg, offset); + } +} + +void Compiler::unwindSetFrameRegWindows(regNumber reg, unsigned offset) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// Compiler::unwindSaveReg: Record a register save. +// +// Arguments: +// reg - The register being saved. +// offset - The offset from the current stack pointer where the register is being saved. +// +void Compiler::unwindSaveReg(regNumber reg, unsigned offset) +{ +#ifdef UNIX_AMD64_ABI + if (generateCFIUnwindCodes()) + { + unwindSaveRegCFI(reg, offset); + } + else +#endif // UNIX_AMD64_ABI + { + unwindSaveRegWindows(reg, offset); + } +} + +void Compiler::unwindSaveRegWindows(regNumber reg, unsigned offset) +{ + assert(false); +} + +#ifdef UNIX_AMD64_ABI +void Compiler::unwindSaveRegCFI(regNumber reg, unsigned offset) +{ + assert(compGeneratingProlog); + + if (RBM_CALLEE_SAVED & genRegMask(reg)) + { + FuncInfoDsc* func = funCurrentFunc(); + + unsigned int cbProlog = unwindGetCurrentOffset(func); + createCfiCode(func, cbProlog, CFI_REL_OFFSET, mapRegNumToDwarfReg(reg), offset); + } +} +#endif // UNIX_AMD64_ABI + +#ifdef DEBUG + +//------------------------------------------------------------------------ +// DumpUnwindInfo: Dump the unwind data. +// +// Arguments: +// isHotCode - true if this unwind data is for the hot section, false otherwise. +// startOffset - byte offset of the code start that this unwind data represents. +// endOffset - byte offset of the code end that this unwind data represents. +// pHeader - pointer to the unwind data blob. +// +void DumpUnwindInfo(bool isHotCode, + UNATIVE_OFFSET startOffset, + UNATIVE_OFFSET endOffset, + const UNWIND_INFO* const pHeader) +{ + assert(false); +} + +#endif // DEBUG + +//------------------------------------------------------------------------ +// Compiler::unwindReserve: Ask the VM to reserve space for the unwind information +// for the function and all its funclets. Called once, just before asking the VM +// for memory and emitting the generated code. 
Calls unwindReserveFunc() to handle +// the main function and each of the funclets, in turn. +// +void Compiler::unwindReserve() +{ + assert(!compGeneratingProlog); + assert(!compGeneratingEpilog); + + assert(compFuncInfoCount > 0); + for (unsigned funcIdx = 0; funcIdx < compFuncInfoCount; funcIdx++) + { + unwindReserveFunc(funGetFunc(funcIdx)); + } +} + +//------------------------------------------------------------------------ +// Compiler::unwindReserveFunc: Reserve the unwind information from the VM for a +// given main function or funclet. +// +// Arguments: +// func - The main function or funclet to reserve unwind info for. +// +void Compiler::unwindReserveFunc(FuncInfoDsc* func) +{ + assert(false); +} + +//------------------------------------------------------------------------ +// Compiler::unwindEmit: Report all the unwind information to the VM. +// +// Arguments: +// pHotCode - Pointer to the beginning of the memory with the function and funclet hot code. +// pColdCode - Pointer to the beginning of the memory with the function and funclet cold code. +// +void Compiler::unwindEmit(void* pHotCode, void* pColdCode) +{ + assert(!compGeneratingProlog); + assert(!compGeneratingEpilog); + + assert(compFuncInfoCount > 0); + for (unsigned funcIdx = 0; funcIdx < compFuncInfoCount; funcIdx++) + { + unwindEmitFunc(funGetFunc(funcIdx), pHotCode, pColdCode); + } +} + +//------------------------------------------------------------------------ +// Compiler::unwindEmitFunc: Report the unwind information to the VM for a +// given main function or funclet. Reports the hot section, then the cold +// section if necessary. +// +// Arguments: +// func - The main function or funclet to reserve unwind info for. +// pHotCode - Pointer to the beginning of the memory with the function and funclet hot code. +// pColdCode - Pointer to the beginning of the memory with the function and funclet cold code. +// +void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode) +{ + assert(false); +} + +#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp index c973f6f63c87..36e41cf11af5 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -333,8 +333,16 @@ void dspRegMask(regMaskTP regMask, size_t minSiz) inRegRange = true; sep = "-"; } -#elif defined(TARGET_X86) +#elif defined(TARGET_X86) // No register ranges +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm + // For AMD64, create ranges for int registers R8 through R15, but not the "old" registers. 
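    // (This wasm branch appears to reuse the AMD64 range-collapsing logic unchanged, which is
    // why the comment above still says "For AMD64"; presumably it only exists so that
    // register-mask dumps keep working under TARGET_WASM32/TARGET_WASM64.)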
+ if (regNum >= REG_R8) + { + regHead = regNum; + inRegRange = true; + sep = "-"; + } #else // TARGET* #error Unsupported or unset target architecture #endif // TARGET* diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index d5c0a9755a56..bf106795dd32 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -57,6 +57,8 @@ struct FloatTraits unsigned bits = 0xFFC00000u; #elif defined(TARGET_ARMARCH) unsigned bits = 0x7FC00000u; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + unsigned bits = 0x7FC00000u; #else #error Unsupported or unset target architecture #endif @@ -83,6 +85,8 @@ struct DoubleTraits unsigned long long bits = 0xFFF8000000000000ull; #elif defined(TARGET_ARMARCH) unsigned long long bits = 0x7FF8000000000000ull; +#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) + unsigned long long bits = 0xFFF8000000000000ull; #else #error Unsupported or unset target architecture #endif diff --git a/src/coreclr/jit/valuenumfuncs.h b/src/coreclr/jit/valuenumfuncs.h index 9e191ffaa614..6c5fb3d81b65 100644 --- a/src/coreclr/jit/valuenumfuncs.h +++ b/src/coreclr/jit/valuenumfuncs.h @@ -180,6 +180,8 @@ ValueNumFuncDef(HWI_##isa##_##name, argCount, false, false, false) // All of t #elif defined (TARGET_ARM) // No Hardware Intrinsics on ARM32 +#elif defined (TARGET_WASM32) || defined(TARGET_WASM64) +// No Hardware Intrinsics on WebAssembly #else #error Unsupported platform #endif diff --git a/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs b/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs index bc65fd69b8d5..3b706364f6a7 100644 --- a/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs +++ b/src/coreclr/tools/Common/JitInterface/JitConfigProvider.cs @@ -131,14 +131,15 @@ public string GetStringConfigValue(string name) private static string GetTargetSpec(TargetDetails target) { - string targetOSComponent = (target.OperatingSystem == TargetOS.Windows ? "win" : "unix"); + string targetOSComponent = (target.OperatingSystem == TargetOS.Windows ? "win" : (target.OperatingSystem == TargetOS.WebAssembly ? 
"browser" : "unix")); string targetArchComponent = target.Architecture switch { TargetArchitecture.X86 => "x86", TargetArchitecture.X64 => "x64", TargetArchitecture.ARM => "arm", TargetArchitecture.ARM64 => "arm64", - TargetArchitecture.Wasm32 => "x64", // "wasm32", == needs a clrjit module + TargetArchitecture.Wasm32 => "wasm32", + TargetArchitecture.Wasm64 => "wasm64", _ => throw new NotImplementedException(target.Architecture.ToString()) }; diff --git a/src/coreclr/tools/Common/TypeSystem/Common/TargetArchitecture.cs b/src/coreclr/tools/Common/TypeSystem/Common/TargetArchitecture.cs index b3b587d1432f..f942cf695362 100644 --- a/src/coreclr/tools/Common/TypeSystem/Common/TargetArchitecture.cs +++ b/src/coreclr/tools/Common/TypeSystem/Common/TargetArchitecture.cs @@ -17,5 +17,6 @@ public enum TargetArchitecture X64, X86, Wasm32, + Wasm64 } } diff --git a/src/coreclr/tools/aot/ilc.sln b/src/coreclr/tools/aot/ilc.sln index 6b8af92a4d87..5c12affa7263 100644 --- a/src/coreclr/tools/aot/ilc.sln +++ b/src/coreclr/tools/aot/ilc.sln @@ -18,7 +18,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ILCompiler.LLVM", "ILCompil EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "repro", "ILCompiler\repro\repro.csproj", "{CBDE0470-E0C9-4693-9A11-ACC117522F3F}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x86_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x86_x64.vcxproj", "{AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_browser_wasm32_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_browser_wasm32_x64.vcxproj", "{A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x64_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x64_x64.vcxproj", "{A75E7596-C53A-3C6F-8FD7-AC56E41F3783}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -204,22 +206,38 @@ Global {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x64.Build.0 = Release|x64 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x86.ActiveCfg = Release|x86 {CBDE0470-E0C9-4693-9A11-ACC117522F3F}.RelWithDebInfo|x86.Build.0 = Release|x86 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|Any CPU.ActiveCfg = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.ActiveCfg = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.Build.0 = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x86.ActiveCfg = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|Any CPU.ActiveCfg = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.ActiveCfg = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.Build.0 = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x86.ActiveCfg = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|Any CPU.ActiveCfg = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.ActiveCfg = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.Build.0 = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x86.ActiveCfg = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x86.ActiveCfg = 
RelWithDebInfo|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Checked|Any CPU.ActiveCfg = Checked|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Checked|x64.ActiveCfg = Checked|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Checked|x64.Build.0 = Checked|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Checked|x86.ActiveCfg = Checked|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Debug|Any CPU.ActiveCfg = Debug|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Debug|x64.ActiveCfg = Debug|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Debug|x64.Build.0 = Debug|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Debug|x86.ActiveCfg = Debug|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Release|Any CPU.ActiveCfg = Release|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Release|x64.ActiveCfg = Release|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Release|x64.Build.0 = Release|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.Release|x86.ActiveCfg = Release|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {A88F0FA2-0F52-3EC8-BBFA-7FC6F4E93B56}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Checked|Any CPU.ActiveCfg = Checked|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Checked|x64.ActiveCfg = Checked|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Checked|x64.Build.0 = Checked|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Checked|x86.ActiveCfg = Checked|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Debug|Any CPU.ActiveCfg = Debug|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Debug|x64.ActiveCfg = Debug|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Debug|x64.Build.0 = Debug|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Debug|x86.ActiveCfg = Debug|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Release|Any CPU.ActiveCfg = Release|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Release|x64.ActiveCfg = Release|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Release|x64.Build.0 = Release|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.Release|x86.ActiveCfg = Release|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE From f4827f47e349ec535df46fd768a66cda361bae22 Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 7 Feb 2021 13:05:39 -0500 Subject: [PATCH 05/44] break fix for non wasm clrjit compilation wasm compilation will now complete, although has an issue with the dbg instruction in Array Resize --- src/coreclr/jit/compiler.cpp | 6 +++--- src/coreclr/jit/gentree.cpp | 4 ++-- src/coreclr/jit/jiteh.cpp | 2 +- src/coreclr/jit/target.h | 19 ++++++++++--------- .../Common/Compiler/InstructionSetSupport.cs | 4 ++++ .../Compiler/LLVMCodegenCompilation.cs | 6 ++++++ 6 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 429a0b65ca43..581dc7dddb44 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -830,7 +830,7 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, // Arm64 
Windows VarArg methods arguments will not classify HFA/HVA types, they will need to be treated // as if they are not HFA/HVA types. var_types hfaType; -#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) +#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) || defined (TARGET_WASM32) || defined(TARGET_WASM64) if (isVarArg) { hfaType = TYP_UNDEF; @@ -923,7 +923,7 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, howToPassStruct = SPK_ByValue; useType = TYP_STRUCT; -#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) +#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined (TARGET_WASM32) || defined(TARGET_WASM64) // TODO: WASM can in theory pass any size struct as an arg. // Otherwise we pass this struct by reference to a copy // setup wbPassType and useType indicate that this is passed using one register (by reference to a copy) @@ -5057,9 +5057,9 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags #if defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO:after rat, but better before? DoLlvmPhase(this); // DoPhase? + return; #endif - return; // Here we do "simple lowering". When the RyuJIT backend works for all // platforms, this will be part of the more general lowering phase. For now, though, we do a separate // pass of "final lowering." We must do this before (final) liveness analysis, because this creates diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 18ced83e4da2..b80564a58b4e 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -3088,7 +3088,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ // we have already found either a non-ADD op1 or a non-constant op2. gtWalkOp(&op1, &op2, nullptr, true); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // For XARCH we will fold GT_ADDs in the op2 position into the addressing mode, so we call // gtWalkOp on both operands of the original GT_ADD. // This is not done for ARMARCH. Though the stated reason is that we don't try to create a @@ -3098,7 +3098,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ // into the addressing mode. // Walk op2 looking for non-overflow GT_ADDs of constants. 
gtWalkOp(&op2, &op1, nullptr, true); -#endif // defined(TARGET_XARCH) +#endif // defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // OK we are done walking the tree // Now assert that op1 and op2 correspond with base and idx diff --git a/src/coreclr/jit/jiteh.cpp b/src/coreclr/jit/jiteh.cpp index 8f511a2913d1..13f6e19df1d9 100644 --- a/src/coreclr/jit/jiteh.cpp +++ b/src/coreclr/jit/jiteh.cpp @@ -891,7 +891,7 @@ unsigned Compiler::ehGetCallFinallyRegionIndex(unsigned finallyIndex, bool* inTr assert(finallyIndex != EHblkDsc::NO_ENCLOSING_INDEX); assert(ehGetDsc(finallyIndex)->HasFinallyHandler()); -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) return ehGetDsc(finallyIndex)->ebdGetEnclosingRegionIndex(inTryRegion); #else *inTryRegion = true; diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 10df08e0b673..5e04e6a1b826 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -1710,7 +1710,8 @@ typedef unsigned char regNumberSmall; #define REG_ETW_FRAMED_EBP_COUNT 1 #endif // !ETW_EBP_FRAMED -#ifdef UNIX_AMD64_ABI +//#ifdef UNIX_AMD64_ABI +// WASM take these as they seem more liberal #define MIN_ARG_AREA_FOR_CALL 0 // Minimum required outgoing argument space for a call. #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) @@ -1725,14 +1726,14 @@ typedef unsigned char regNumberSmall; #define REG_DEFAULT_PROFILER_CALL_TARGET REG_R11 -#else // !UNIX_AMD64_ABI -#define MIN_ARG_AREA_FOR_CALL (4 * REGSIZE_BYTES) // Minimum required outgoing argument space for a call. - -#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) -#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) -#define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) -#define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) -#endif // !UNIX_AMD64_ABI +//#else // !UNIX_AMD64_ABI +//#define MIN_ARG_AREA_FOR_CALL (4 * REGSIZE_BYTES) // Minimum required outgoing argument space for a call. +// +//#define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) +//#define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) +//#define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) +//#define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) +//#endif // !UNIX_AMD64_ABI #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 diff --git a/src/coreclr/tools/Common/Compiler/InstructionSetSupport.cs b/src/coreclr/tools/Common/Compiler/InstructionSetSupport.cs index 5a8e04f0d4a3..2e7b60f2978b 100644 --- a/src/coreclr/tools/Common/Compiler/InstructionSetSupport.cs +++ b/src/coreclr/tools/Common/Compiler/InstructionSetSupport.cs @@ -81,6 +81,10 @@ public static string GetHardwareIntrinsicId(TargetArchitecture architecture, Typ if (potentialType.Namespace != "System.Runtime.Intrinsics.Arm") return ""; } + else if (architecture == TargetArchitecture.Wasm32 || architecture == TargetArchitecture.Wasm64) + { + return ""; // No "hardware" for Wasm at all. 
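                // Returning the empty id for Wasm32/Wasm64 means no type under
                // System.Runtime.Intrinsics is treated as a hardware intrinsic, so those
                // methods presumably fall back to their ordinary, non-intrinsic handling.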
+ } else { throw new InternalCompilerErrorException("Unknown architecture"); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index eacf12cf8829..d8205ad81590 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System; using System.Collections.Generic; using System.Runtime.CompilerServices; using System.Threading; @@ -61,6 +62,7 @@ protected override void CompileInternal(string outputFile, ObjectDumper dumper) var nodes = _dependencyGraph.MarkedNodeList; + Console.WriteLine($"RyuJIT compilation results, total method {totalMethodCount} RyuJit Methods {ryuJitMethodCount} % {ryuJitMethodCount * 100 / totalMethodCount}"); LLVMObjectWriter.EmitObject(outputFile, nodes, NodeFactory, this, dumper); } @@ -105,6 +107,8 @@ private void CompileSingleThreaded(List methodsToCompile) } } + static int totalMethodCount; + static int ryuJitMethodCount; private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodCodeNodeNeedingCode) { MethodDesc method = methodCodeNodeNeedingCode.Method; @@ -112,6 +116,7 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC try { corInfo.CompileMethod(methodCodeNodeNeedingCode); + ryuJitMethodCount++; } catch (CodeGenerationFailedException) { @@ -132,6 +137,7 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC } finally { + totalMethodCount++; // if (_compilationCountdown != null) // _compilationCountdown.Signal(); } From 102575040c54f2d9470b5e56998c19e764ace26a Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 7 Feb 2021 18:40:00 -0500 Subject: [PATCH 06/44] revert project outputpath --- src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj b/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj index cc75a03b8343..f110ea5d4c88 100644 --- a/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj +++ b/src/coreclr/tools/aot/ILCompiler/ILCompiler.csproj @@ -9,7 +9,7 @@ AnyCPU false true - E:\GitHub\runtimelab\artifacts\obj\coreclr\windows.x64.Debug\jit\Debug\ + $(RuntimeBinDir)ilc true false linux-x64;win-x64;osx-x64 @@ -59,7 +59,8 @@ false - + PreserveNewest false false @@ -134,6 +135,10 @@ On Linux renaming the library makes it difficult to debug it. --> - + From 0fc5ce19774c265b66305eabe701c803b3dae46c Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 7 Feb 2021 21:28:42 -0500 Subject: [PATCH 07/44] Remove lowering/emit/lrsa/codegen by #ifdefs Some code alerts false where not sure what to do. Passes non-wasm tests locally. 
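The shape of the change, as a rough sketch with illustrative names (not the real CoreCLR
functions): register-based back-end code is fenced off with the new TARGET_WASM define,
and entry points that still have to exist for the wasm build assert if they are ever hit,
since code generation is handed to LLVM instead.

    #include <cassert>

    #ifndef TARGET_WASM
    // Normal targets: the register-based back end emits machine code.
    static void EmitMethodCode() { /* register allocation, instruction emission */ }
    #else
    // Wasm: this path must never be reached; LLVM bitcode is produced elsewhere.
    static void EmitMethodCode() { assert(false && "unreachable for TARGET_WASM"); }
    #endif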
--- src/coreclr/jit/CMakeLists.txt | 11 +- src/coreclr/jit/codegen.h | 7 +- src/coreclr/jit/codegencommon.cpp | 31 +- src/coreclr/jit/codegeninterface.h | 6 +- src/coreclr/jit/codegenlinear.cpp | 2 + src/coreclr/jit/codegenwasm.cpp | 6236 ---------------------- src/coreclr/jit/compiler.cpp | 100 +- src/coreclr/jit/compiler.h | 51 +- src/coreclr/jit/compiler.hpp | 2 + src/coreclr/jit/ee_il_dll.cpp | 6 + src/coreclr/jit/emit.cpp | 14 +- src/coreclr/jit/emit.h | 88 +- src/coreclr/jit/emitdef.h | 2 - src/coreclr/jit/emitfmts.h | 3 +- src/coreclr/jit/emitfmtswasm.h | 218 - src/coreclr/jit/emitinl.h | 85 +- src/coreclr/jit/emitjmps.h | 23 +- src/coreclr/jit/emitwasm.cpp | 7217 -------------------------- src/coreclr/jit/emitwasm.h | 573 -- src/coreclr/jit/emitxarch.cpp | 2 +- src/coreclr/jit/fgopt.cpp | 4 + src/coreclr/jit/flowgraph.cpp | 2 + src/coreclr/jit/gcencode.cpp | 3 +- src/coreclr/jit/gcinfo.cpp | 2 + src/coreclr/jit/gentree.cpp | 474 +- src/coreclr/jit/instr.cpp | 20 +- src/coreclr/jit/jiteh.cpp | 8 +- src/coreclr/jit/lclvars.cpp | 10 +- src/coreclr/jit/liveness.cpp | 8 + src/coreclr/jit/lower.cpp | 4 +- src/coreclr/jit/lower.h | 7 +- src/coreclr/jit/lsra.cpp | 2 + src/coreclr/jit/lsra.h | 13 +- src/coreclr/jit/lsrabuild.cpp | 2 + src/coreclr/jit/morph.cpp | 15 +- src/coreclr/jit/regalloc.cpp | 6 + src/coreclr/jit/regset.cpp | 6 + src/coreclr/jit/scopeinfo.cpp | 2 + src/coreclr/jit/stacklevelsetter.cpp | 8 + src/coreclr/jit/treelifeupdater.cpp | 17 + src/coreclr/jit/unwind.cpp | 6 +- 41 files changed, 769 insertions(+), 14527 deletions(-) delete mode 100644 src/coreclr/jit/codegenwasm.cpp delete mode 100644 src/coreclr/jit/emitfmtswasm.h delete mode 100644 src/coreclr/jit/emitwasm.cpp delete mode 100644 src/coreclr/jit/emitwasm.h diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 0cca94100c90..7fe9599e385b 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -67,9 +67,11 @@ function(create_standalone_jit) endif () if (TARGETDETAILS_ARCH STREQUAL "wasm64") + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM64) endif () if (TARGETDETAILS_ARCH STREQUAL "wasm32") + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM32) endif () endfunction() @@ -289,7 +291,6 @@ if (CLR_CMAKE_TARGET_WIN32) list (APPEND JIT_HEADERS emitfmtsxarch.h emitxarch.h - emitwasm.h hwintrinsiclistxarch.h hwintrinsic.h instrsxarch.h @@ -315,10 +316,6 @@ set( JIT_AMD64_SOURCES # TODO this is just a copy of AMD64_SOURCES to get started - e.g. simd,hwintrinsics doesn't make sense for wasm set( JIT_WASM64_SOURCES - codegenwasm.cpp - emitwasm.cpp - lowerwasm.cpp - lsrawasm.cpp simd.cpp simdashwintrinsic.cpp simdcodegenxarch.cpp @@ -328,10 +325,6 @@ set( JIT_WASM64_SOURCES hwintrinsiccodegenxarch.cpp ) set( JIT_WASM32_SOURCES - codegenwasm.cpp - emitwasm.cpp - lowerwasm.cpp - lsrawasm.cpp simd.cpp simdashwintrinsic.cpp simdcodegenxarch.cpp diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index cf9551454d60..d6d1610b3dc7 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -6,7 +6,7 @@ // of a method, except for the target-specific elements, which are // primarily in the Target class. 
// - +#ifndef TARGET_WASM #ifndef _CODEGEN_H_ #define _CODEGEN_H_ #include "codegeninterface.h" @@ -50,7 +50,7 @@ class CodeGen final : public CodeGenInterface ssize_t* cnsPtr); private: -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) // Bit masks used in negating a float or double number. // This is to avoid creating more than one data constant for these bitmasks when a // method has more than one GT_NEG operation on floating point values. @@ -422,7 +422,7 @@ class CodeGen final : public CodeGenInterface #endif // TARGET_AMD64 -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) // Save/Restore callee saved float regs to stack void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize); @@ -1615,3 +1615,4 @@ inline void DoPhase(CodeGen* _codeGen, Phases _phase, void (CodeGen::*_action)() } #endif // _CODEGEN_H_ +#endif // TARGET_WASM diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 3e3f56a1f19d..7ac9c80d2b5c 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -13,7 +13,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // TODO-Cleanup: There are additional methods in CodeGen*.cpp that are almost // identical, and which should probably be moved here. - +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -30,6 +30,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "patchpointinfo.h" /*****************************************************************************/ +#endif //!TARGET_WASM const BYTE genTypeSizes[] = { #define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) sz, @@ -54,6 +55,7 @@ const BYTE genActualTypes[] = { #include "typelist.h" #undef DEF_TP }; +#ifndef TARGET_WASM void CodeGenInterface::setFramePointerRequiredEH(bool value) { @@ -820,7 +822,7 @@ TempDsc* CodeGenInterface::getSpillTempDsc(GenTree* tree) return temp; } -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#ifdef TARGET_XARCH #ifdef TARGET_AMD64 // Returns relocation type hint for an addr. @@ -1926,7 +1928,7 @@ void CodeGen::genCheckOverflow(GenTree* tree) { bool isUnsignedOverflow = ((tree->gtFlags & GTF_UNSIGNED) != 0); -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) jumpKind = isUnsignedOverflow ? EJ_jb : EJ_jo; @@ -5663,10 +5665,6 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& #elif defined(TARGET_ARM64) // We will just zero out the entire vector register. This sets it to a double/float zero value GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, reg, 0x00, INS_OPTS_16B); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - // XORPS is the fastest and smallest way to initialize a XMM register to zero. - inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE); - dblInitReg = reg; #else // TARGET* #error Unsupported or unset target architecture #endif @@ -5702,10 +5700,6 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP& #elif defined(TARGET_ARM64) // We will just zero out the entire vector register. 
This sets it to a double/float zero value GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, reg, 0x00, INS_OPTS_16B); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - // XORPS is the fastest and smallest way to initialize a XMM register to zero. - inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE); - fltInitReg = reg; #else // TARGET* #error Unsupported or unset target architecture #endif @@ -6015,7 +6009,7 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) } } -#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_XARCH) void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog) { @@ -6336,7 +6330,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, #endif // TARGET_ARM64 noway_assert(uCntBytes == 0); -#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#elif defined(TARGET_XARCH) assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported); emitter* emit = GetEmitter(); regNumber frameReg = genFramePointerReg(); @@ -6348,7 +6342,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, noway_assert((blkSize % sizeof(int)) == 0); // initReg is not a live incoming argument reg assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_AMD64) // We will align on x64 so can use the aligned mov instruction simdMov = simdAlignedMovIns(); // Aligning low we want to move up to next boundary @@ -6374,7 +6368,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, { emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i); } -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_AMD64) assert((i == blkSize) || (i + (int)sizeof(int) == blkSize)); if (i != blkSize) { @@ -6396,7 +6390,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM4); #endif // UNIX_AMD64_ABI -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_AMD64) int alignedLclHi; int alignmentHiBlkSize; @@ -8505,7 +8499,7 @@ void CodeGen::genFnEpilog(BasicBlock* block) compiler->unwindEndEpilog(); } -#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#elif defined(TARGET_XARCH) void CodeGen::genFnEpilog(BasicBlock* block) { @@ -9880,7 +9874,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) // Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE] // Here offset = 16-byte aligned offset after pushing integer registers. 
@@ -12936,3 +12930,4 @@ void CodeGenInterface::VariableLiveKeeper::dumpLvaVariableLiveRanges() const } #endif // DEBUG #endif // USING_VARIABLE_LIVE_RANGE +#endif // TARGET_WASM diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index e5029fbfa4c5..87fa757cacaa 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -17,6 +17,7 @@ // accessed from members of Compiler. // +#ifndef TARGET_WASM #ifndef _CODEGEN_INTERFACE_H_ #define _CODEGEN_INTERFACE_H_ @@ -116,8 +117,6 @@ class CodeGenInterface static const insFlags instInfo[INS_count]; #elif defined(TARGET_ARM) || defined(TARGET_ARM64) static const BYTE instInfo[INS_count]; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - static const insFlags instInfo[INS_count]; #else #error Unsupported target architecture #endif @@ -202,7 +201,7 @@ class CodeGenInterface regNumber genGetThisArgReg(GenTreeCall* call) const; -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#ifdef TARGET_XARCH #ifdef TARGET_AMD64 // There are no reloc hints on x86 unsigned short genAddrRelocTypeHint(size_t addr); @@ -777,3 +776,4 @@ class CodeGenInterface }; #endif // _CODEGEN_INTERFACE_H_ +#endif // TARGET_WASM diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 215e3c04f75b..dc86ff503587 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -9,6 +9,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -2648,3 +2649,4 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) inst_SETCC(setcc->gtCondition, setcc->TypeGet(), setcc->GetRegNum()); genProduceReg(setcc); } +#endif // TARGET_WASM diff --git a/src/coreclr/jit/codegenwasm.cpp b/src/coreclr/jit/codegenwasm.cpp deleted file mode 100644 index 631277f36f94..000000000000 --- a/src/coreclr/jit/codegenwasm.cpp +++ /dev/null @@ -1,6236 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XX XX -XX Amd64/x86 Code Generator XX -XX XX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -*/ -#include "jitpch.h" -#ifdef _MSC_VER -#pragma hdrstop -#pragma warning(disable : 4310) // cast truncates constant value - happens for (int8_t)0xb1 -#endif - -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) -#include "emit.h" -#include "codegen.h" -#include "lower.h" -#include "gcinfo.h" -#include "gcinfoencoder.h" -#include "patchpointinfo.h" - -/***************************************************************************** - * - * Generate code that will set the given register to the integer constant. - */ - -void CodeGen::genSetRegToIcon(regNumber reg, ssize_t val, var_types type, insFlags flags) -{ - // Reg cannot be a FP reg - assert(!genIsValidFloatReg(reg)); - - // The only TYP_REF constant that can come this path is a managed 'null' since it is not - // relocatable. Other ref type constants (e.g. string objects) go through a different - // code path. 
- noway_assert(type != TYP_REF || val == 0); - - if (val == 0) - { - instGen_Set_Reg_To_Zero(emitActualTypeSize(type), reg, flags); - } - else - { - // TODO-XArch-CQ: needs all the optimized cases - GetEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(type), reg, val); - } -} - -//--------------------------------------------------------------------- -// genSetGSSecurityCookie: Set the "GS" security cookie in the prolog. -// -// Arguments: -// initReg - register to use as a scratch register -// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if -// this call sets 'initReg' to a non-zero value. -// -// Return Value: -// None -// -void CodeGen::genSetGSSecurityCookie(regNumber initReg, bool* pInitRegZeroed) -{ - assert(compiler->compGeneratingProlog); - - if (!compiler->getNeedsGSSecurityCookie()) - { - return; - } - - if (compiler->opts.IsOSR() && compiler->info.compPatchpointInfo->HasSecurityCookie()) - { - // Security cookie is on original frame and was initialized there. - return; - } - - if (compiler->gsGlobalSecurityCookieAddr == nullptr) - { - noway_assert(compiler->gsGlobalSecurityCookieVal != 0); -#ifdef TARGET_AMD64 - if ((int)compiler->gsGlobalSecurityCookieVal != compiler->gsGlobalSecurityCookieVal) - { - // initReg = #GlobalSecurityCookieVal64; [frame.GSSecurityCookie] = initReg - genSetRegToIcon(initReg, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL); - GetEmitter()->emitIns_S_R(INS_mov, EA_PTRSIZE, initReg, compiler->lvaGSSecurityCookie, 0); - *pInitRegZeroed = false; - } - else -#endif - { - // mov dword ptr [frame.GSSecurityCookie], #GlobalSecurityCookieVal - GetEmitter()->emitIns_S_I(INS_mov, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0, - (int)compiler->gsGlobalSecurityCookieVal); - } - } - else - { - // Always use EAX on x86 and x64 - // On x64, if we're not moving into RAX, and the address isn't RIP relative, we can't encode it. - // mov eax, dword ptr [compiler->gsGlobalSecurityCookieAddr] - // mov dword ptr [frame.GSSecurityCookie], eax - GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_EAX, (ssize_t)compiler->gsGlobalSecurityCookieAddr); - regSet.verifyRegUsed(REG_EAX); - GetEmitter()->emitIns_S_R(INS_mov, EA_PTRSIZE, REG_EAX, compiler->lvaGSSecurityCookie, 0); - if (initReg == REG_EAX) - { - *pInitRegZeroed = false; - } - } -} - -/***************************************************************************** - * - * Generate code to check that the GS cookie wasn't thrashed by a buffer - * overrun. If pushReg is true, preserve all registers around code sequence. - * Otherwise ECX could be modified. - * - * Implementation Note: pushReg = true, in case of tail calls. - */ -void CodeGen::genEmitGSCookieCheck(bool pushReg) -{ - noway_assert(compiler->gsGlobalSecurityCookieAddr || compiler->gsGlobalSecurityCookieVal); - - // Make sure that EAX is reported as live GC-ref so that any GC that kicks in while - // executing GS cookie check will not collect the object pointed to by EAX. - // - // For Amd64 System V, a two-register-returned struct could be returned in RAX and RDX - // In such case make sure that the correct GC-ness of RDX is reported as well, so - // a GC object pointed by RDX will not be collected. 
- if (!pushReg) - { - // Handle multi-reg return type values - if (compiler->compMethodReturnsMultiRegRetType()) - { - ReturnTypeDesc retTypeDesc; - if (varTypeIsLong(compiler->info.compRetNativeType)) - { - retTypeDesc.InitializeLongReturnType(); - } - else // we must have a struct return type - { - retTypeDesc.InitializeStructReturnType(compiler, compiler->info.compMethodInfo->args.retTypeClass, - compiler->info.compCallConv); - } - - const unsigned regCount = retTypeDesc.GetReturnRegCount(); - - // Only x86 and x64 Unix ABI allows multi-reg return and - // number of result regs should be equal to MAX_RET_REG_COUNT. - assert(regCount == MAX_RET_REG_COUNT); - - for (unsigned i = 0; i < regCount; ++i) - { - gcInfo.gcMarkRegPtrVal(retTypeDesc.GetABIReturnReg(i), retTypeDesc.GetReturnRegType(i)); - } - } - else if (compiler->compMethodReturnsRetBufAddr()) - { - // This is for returning in an implicit RetBuf. - // If the address of the buffer is returned in REG_INTRET, mark the content of INTRET as ByRef. - - // In case the return is in an implicit RetBuf, the native return type should be a struct - assert(varTypeIsStruct(compiler->info.compRetNativeType)); - - gcInfo.gcMarkRegPtrVal(REG_INTRET, TYP_BYREF); - } - // ... all other cases. - else - { -#ifdef TARGET_AMD64 - // For x64, structs that are not returned in registers are always - // returned in implicit RetBuf. If we reached here, we should not have - // a RetBuf and the return type should not be a struct. - assert(compiler->info.compRetBuffArg == BAD_VAR_NUM); - assert(!varTypeIsStruct(compiler->info.compRetNativeType)); -#endif // TARGET_AMD64 - - // For x86 Windows we can't make such assertions since we generate code for returning of - // the RetBuf in REG_INTRET only when the ProfilerHook is enabled. Otherwise - // compRetNativeType could be TYP_STRUCT. - gcInfo.gcMarkRegPtrVal(REG_INTRET, compiler->info.compRetNativeType); - } - } - - regNumber regGSCheck; - regMaskTP regMaskGSCheck = RBM_NONE; - - if (!pushReg) - { - // Non-tail call: we can use any callee trash register that is not - // a return register or contain 'this' pointer (keep alive this), since - // we are generating GS cookie check after a GT_RETURN block. - // Note: On Amd64 System V RDX is an arg register - REG_ARG_2 - as well - // as return register for two-register-returned structs. - if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvRegister && - (compiler->lvaTable[compiler->info.compThisArg].GetRegNum() == REG_ARG_0)) - { - regGSCheck = REG_ARG_1; - } - else - { - regGSCheck = REG_ARG_0; - } - } - else - { -#ifdef TARGET_X86 - // It doesn't matter which register we pick, since we're going to save and restore it - // around the check. - // TODO-CQ: Can we optimize the choice of register to avoid doing the push/pop sometimes? - regGSCheck = REG_EAX; - regMaskGSCheck = RBM_EAX; -#else // !TARGET_X86 - // Jmp calls: specify method handle using which JIT queries VM for its entry point - // address and hence it can neither be a VSD call nor PInvoke calli with cookie - // parameter. Therefore, in case of jmp calls it is safe to use R11. - regGSCheck = REG_R11; -#endif // !TARGET_X86 - } - - regMaskTP byrefPushedRegs = RBM_NONE; - regMaskTP norefPushedRegs = RBM_NONE; - regMaskTP pushedRegs = RBM_NONE; - - if (compiler->gsGlobalSecurityCookieAddr == nullptr) - { -#if defined(TARGET_AMD64) - // If GS cookie value fits within 32-bits we can use 'cmp mem64, imm32'. 
- // Otherwise, load the value into a reg and use 'cmp mem64, reg64'. - if ((int)compiler->gsGlobalSecurityCookieVal != (ssize_t)compiler->gsGlobalSecurityCookieVal) - { - genSetRegToIcon(regGSCheck, compiler->gsGlobalSecurityCookieVal, TYP_I_IMPL); - GetEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0); - } - else -#endif // defined(TARGET_AMD64) - { - assert((int)compiler->gsGlobalSecurityCookieVal == (ssize_t)compiler->gsGlobalSecurityCookieVal); - GetEmitter()->emitIns_S_I(INS_cmp, EA_PTRSIZE, compiler->lvaGSSecurityCookie, 0, - (int)compiler->gsGlobalSecurityCookieVal); - } - } - else - { - // Ngen case - GS cookie value needs to be accessed through an indirection. - - pushedRegs = genPushRegs(regMaskGSCheck, &byrefPushedRegs, &norefPushedRegs); - - instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, regGSCheck, (ssize_t)compiler->gsGlobalSecurityCookieAddr); - GetEmitter()->emitIns_R_AR(ins_Load(TYP_I_IMPL), EA_PTRSIZE, regGSCheck, regGSCheck, 0); - GetEmitter()->emitIns_S_R(INS_cmp, EA_PTRSIZE, regGSCheck, compiler->lvaGSSecurityCookie, 0); - } - - BasicBlock* gsCheckBlk = genCreateTempLabel(); - inst_JMP(EJ_je, gsCheckBlk); - genEmitHelperCall(CORINFO_HELP_FAIL_FAST, 0, EA_UNKNOWN); - genDefineTempLabel(gsCheckBlk); - - genPopRegs(pushedRegs, byrefPushedRegs, norefPushedRegs); -} - -BasicBlock* CodeGen::genCallFinally(BasicBlock* block) -{ -#if defined(FEATURE_EH_FUNCLETS) - // Generate a call to the finally, like this: - // mov rcx,qword ptr [rbp + 20H] // Load rcx with PSPSym - // call finally-funclet - // jmp finally-return // Only for non-retless finally calls - // The jmp can be a NOP if we're going to the next block. - // If we're generating code for the main function (not a funclet), and there is no localloc, - // then RSP at this point is the same value as that stored in the PSPSym. So just copy RSP - // instead of loading the PSPSym in this case, or if PSPSym is not used (CoreRT ABI). - - if ((compiler->lvaPSPSym == BAD_VAR_NUM) || - (!compiler->compLocallocUsed && (compiler->funCurrentFunc()->funKind == FUNC_ROOT))) - { -#ifndef UNIX_X86_ABI - inst_RV_RV(INS_mov, REG_ARG_0, REG_SPBASE, TYP_I_IMPL); -#endif // !UNIX_X86_ABI - } - else - { - GetEmitter()->emitIns_R_S(ins_Load(TYP_I_IMPL), EA_PTRSIZE, REG_ARG_0, compiler->lvaPSPSym, 0); - } - GetEmitter()->emitIns_J(INS_call, block->bbJumpDest); - - if (block->bbFlags & BBF_RETLESS_CALL) - { - // We have a retless call, and the last instruction generated was a call. - // If the next block is in a different EH region (or is the end of the code - // block), then we need to generate a breakpoint here (since it will never - // get executed) to get proper unwind behavior. - - if ((block->bbNext == nullptr) || !BasicBlock::sameEHRegion(block, block->bbNext)) - { - instGen(INS_BREAKPOINT); // This should never get executed - } - } - else - { -// TODO-Linux-x86: Do we need to handle the GC information for this NOP or JMP specially, as is done for other -// architectures? -#ifndef JIT32_GCENCODER - // Because of the way the flowgraph is connected, the liveness info for this one instruction - // after the call is not (can not be) correct in cases where a variable has a last use in the - // handler. So turn off GC reporting for this single instruction. - GetEmitter()->emitDisableGC(); -#endif // JIT32_GCENCODER - - // Now go to where the finally funclet needs to return to. - if (block->bbNext->bbJumpDest == block->bbNext->bbNext) - { - // Fall-through. 
- // TODO-XArch-CQ: Can we get rid of this instruction, and just have the call return directly - // to the next instruction? This would depend on stack walking from within the finally - // handler working without this instruction being in this special EH region. - instGen(INS_nop); - } - else - { - inst_JMP(EJ_jmp, block->bbNext->bbJumpDest); - } - -#ifndef JIT32_GCENCODER - GetEmitter()->emitEnableGC(); -#endif // JIT32_GCENCODER - } - -#else // !FEATURE_EH_FUNCLETS - - // If we are about to invoke a finally locally from a try block, we have to set the ShadowSP slot - // corresponding to the finally's nesting level. When invoked in response to an exception, the - // EE does this. - // - // We have a BBJ_CALLFINALLY followed by a BBJ_ALWAYS. - // - // We will emit : - // mov [ebp - (n + 1)], 0 - // mov [ebp - n ], 0xFC - // push &step - // jmp finallyBlock - // ... - // step: - // mov [ebp - n ], 0 - // jmp leaveTarget - // ... - // leaveTarget: - - noway_assert(isFramePointerUsed()); - - // Get the nesting level which contains the finally - unsigned finallyNesting = 0; - compiler->fgGetNestingLevel(block, &finallyNesting); - - // The last slot is reserved for ICodeManager::FixContext(ppEndRegion) - unsigned filterEndOffsetSlotOffs; - filterEndOffsetSlotOffs = (unsigned)(compiler->lvaLclSize(compiler->lvaShadowSPslotsVar) - TARGET_POINTER_SIZE); - - unsigned curNestingSlotOffs; - curNestingSlotOffs = (unsigned)(filterEndOffsetSlotOffs - ((finallyNesting + 1) * TARGET_POINTER_SIZE)); - - // Zero out the slot for the next nesting level - GetEmitter()->emitIns_S_I(INS_mov, EA_PTRSIZE, compiler->lvaShadowSPslotsVar, - curNestingSlotOffs - TARGET_POINTER_SIZE, 0); - GetEmitter()->emitIns_S_I(INS_mov, EA_PTRSIZE, compiler->lvaShadowSPslotsVar, curNestingSlotOffs, LCL_FINALLY_MARK); - - // Now push the address where the finally funclet should return to directly. - if (!(block->bbFlags & BBF_RETLESS_CALL)) - { - assert(block->isBBCallAlwaysPair()); - GetEmitter()->emitIns_J(INS_push_hide, block->bbNext->bbJumpDest); - } - else - { - // EE expects a DWORD, so we provide 0 - inst_IV(INS_push_hide, 0); - } - - // Jump to the finally BB - inst_JMP(EJ_jmp, block->bbJumpDest); - -#endif // !FEATURE_EH_FUNCLETS - - // The BBJ_ALWAYS is used because the BBJ_CALLFINALLY can't point to the - // jump target using bbJumpDest - that is already used to point - // to the finally block. So just skip past the BBJ_ALWAYS unless the - // block is RETLESS. - if (!(block->bbFlags & BBF_RETLESS_CALL)) - { - assert(block->isBBCallAlwaysPair()); - block = block->bbNext; - } - return block; -} - -#if defined(FEATURE_EH_FUNCLETS) -void CodeGen::genEHCatchRet(BasicBlock* block) -{ - // Set RAX to the address the VM should return to after the catch. - // Generate a RIP-relative - // lea reg, [rip + disp32] ; the RIP is implicit - // which will be position-independent. - GetEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, block->bbJumpDest, REG_INTRET); -} - -#else // !FEATURE_EH_FUNCLETS - -void CodeGen::genEHFinallyOrFilterRet(BasicBlock* block) -{ - // The last statement of the block must be a GT_RETFILT, which has already been generated. - assert(block->lastNode() != nullptr); - assert(block->lastNode()->OperGet() == GT_RETFILT); - - if (block->bbJumpKind == BBJ_EHFINALLYRET) - { - assert(block->lastNode()->AsOp()->gtOp1 == nullptr); // op1 == nullptr means endfinally - - // Return using a pop-jmp sequence. 
As the "try" block calls - // the finally with a jmp, this leaves the x86 call-ret stack - // balanced in the normal flow of path. - - noway_assert(isFramePointerRequired()); - inst_RV(INS_pop_hide, REG_EAX, TYP_I_IMPL); - inst_RV(INS_i_jmp, REG_EAX, TYP_I_IMPL); - } - else - { - assert(block->bbJumpKind == BBJ_EHFILTERRET); - - // The return value has already been computed. - instGen_Return(0); - } -} - -#endif // !FEATURE_EH_FUNCLETS - -// Move an immediate value into an integer register - -void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size, - regNumber reg, - ssize_t imm, - insFlags flags DEBUGARG(size_t targetHandle) DEBUGARG(unsigned gtFlags)) -{ - // reg cannot be a FP register - assert(!genIsValidFloatReg(reg)); - - if (!compiler->opts.compReloc) - { - size = EA_SIZE(size); // Strip any Reloc flags from size if we aren't doing relocs - } - - if ((imm == 0) && !EA_IS_RELOC(size)) - { - instGen_Set_Reg_To_Zero(size, reg, flags); - } - else - { - if (genDataIndirAddrCanBeEncodedAsPCRelOffset(imm)) - { - emitAttr newSize = EA_PTR_DSP_RELOC; - if (EA_IS_BYREF(size)) - { - newSize = EA_SET_FLG(newSize, EA_BYREF_FLG); - } - - GetEmitter()->emitIns_R_AI(INS_lea, newSize, reg, imm); - } - else - { - GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm); - } - } - regSet.verifyRegUsed(reg); -} - -/*********************************************************************************** - * - * Generate code to set a register 'targetReg' of type 'targetType' to the constant - * specified by the constant (GT_CNS_INT or GT_CNS_DBL) in 'tree'. This does not call - * genProduceReg() on the target register. - */ -void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree) -{ - switch (tree->gtOper) - { - case GT_CNS_INT: - { - // relocatable values tend to come down as a CNS_INT of native int type - // so the line between these two opcodes is kind of blurry - GenTreeIntConCommon* con = tree->AsIntConCommon(); - ssize_t cnsVal = con->IconValue(); - - if (con->ImmedValNeedsReloc(compiler)) - { - emitAttr size = EA_HANDLE_CNS_RELOC; - - if (targetType == TYP_BYREF) - { - size = EA_SET_FLG(size, EA_BYREF_FLG); - } - - instGen_Set_Reg_To_Imm(size, targetReg, cnsVal); - regSet.verifyRegUsed(targetReg); - } - else - { - genSetRegToIcon(targetReg, cnsVal, targetType); - } - } - break; - - case GT_CNS_DBL: - { - emitter* emit = GetEmitter(); - emitAttr size = emitTypeSize(targetType); - double constValue = tree->AsDblCon()->gtDconVal; - - // Make sure we use "xorps reg, reg" only for +ve zero constant (0.0) and not for -ve zero (-0.0) - if (*(__int64*)&constValue == 0) - { - // A faster/smaller way to generate 0 - emit->emitIns_R_R(INS_xorps, size, targetReg, targetReg); - } - else - { - CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size); - emit->emitIns_R_C(ins_Load(targetType), size, targetReg, hnd, 0); - } - } - break; - - default: - unreached(); - } -} - -//------------------------------------------------------------------------ -// genCodeForNegNot: Produce code for a GT_NEG/GT_NOT node. 
-// -// Arguments: -// tree - the node -// -void CodeGen::genCodeForNegNot(GenTree* tree) -{ - assert(tree->OperIs(GT_NEG, GT_NOT)); - - regNumber targetReg = tree->GetRegNum(); - var_types targetType = tree->TypeGet(); - - if (varTypeIsFloating(targetType)) - { - assert(tree->gtOper == GT_NEG); - genSSE2BitwiseOp(tree); - } - else - { - GenTree* operand = tree->gtGetOp1(); - assert(operand->isUsedFromReg()); - regNumber operandReg = genConsumeReg(operand); - - if (operandReg != targetReg) - { - inst_RV_RV(INS_mov, targetReg, operandReg, targetType); - } - - instruction ins = genGetInsForOper(tree->OperGet(), targetType); - inst_RV(ins, targetReg, targetType); - } - - genProduceReg(tree); -} - -//------------------------------------------------------------------------ -// genCodeForBswap: Produce code for a GT_BSWAP / GT_BSWAP16 node. -// -// Arguments: -// tree - the node -// -void CodeGen::genCodeForBswap(GenTree* tree) -{ - // TODO: If we're swapping immediately after a read from memory or immediately before - // a write to memory, use the MOVBE instruction instead of the BSWAP instruction if - // the platform supports it. - - assert(tree->OperIs(GT_BSWAP, GT_BSWAP16)); - - regNumber targetReg = tree->GetRegNum(); - var_types targetType = tree->TypeGet(); - - GenTree* operand = tree->gtGetOp1(); - assert(operand->isUsedFromReg()); - regNumber operandReg = genConsumeReg(operand); - - if (operandReg != targetReg) - { - inst_RV_RV(INS_mov, targetReg, operandReg, targetType); - } - - if (tree->OperIs(GT_BSWAP)) - { - // 32-bit and 64-bit byte swaps use "bswap reg" - inst_RV(INS_bswap, targetReg, targetType); - } - else - { - // 16-bit byte swaps use "ror reg.16, 8" - inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); - } - - genProduceReg(tree); -} - -// Generate code to get the high N bits of a N*N=2N bit multiplication result -void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) -{ - assert(!treeNode->gtOverflowEx()); - - regNumber targetReg = treeNode->GetRegNum(); - var_types targetType = treeNode->TypeGet(); - emitter* emit = GetEmitter(); - emitAttr size = emitTypeSize(treeNode); - GenTree* op1 = treeNode->AsOp()->gtOp1; - GenTree* op2 = treeNode->AsOp()->gtOp2; - - // to get the high bits of the multiply, we are constrained to using the - // 1-op form: RDX:RAX = RAX * rm - // The 3-op form (Rx=Ry*Rz) does not support it. 
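// Editorial note (not part of the patch): the removed comment above describes GT_MULHI,
// which keeps only the high half of a widening multiply via the one-operand
// "mul/imul rm" form (RDX:RAX = RAX * rm). A minimal, portable sketch of the same
// computation, assuming 32-bit operands; the function names are illustrative only
// and do not exist in the JIT.
#include <cstdint>

static uint32_t MulHiUnsigned32(uint32_t a, uint32_t b)
{
    // Widen to 64 bits, multiply, keep bits [63:32] - this is what the
    // one-operand "mul rm32" leaves in EDX when EAX holds 'a'.
    return (uint32_t)(((uint64_t)a * (uint64_t)b) >> 32);
}

static int32_t MulHiSigned32(int32_t a, int32_t b)
{
    // The signed form corresponds to the one-operand "imul rm32".
    return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
}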
- - genConsumeOperands(treeNode->AsOp()); - - GenTree* regOp = op1; - GenTree* rmOp = op2; - - // Set rmOp to the memory operand (if any) - if (op1->isUsedFromMemory() || (op2->isUsedFromReg() && (op2->GetRegNum() == REG_RAX))) - { - regOp = op2; - rmOp = op1; - } - assert(regOp->isUsedFromReg()); - - // Setup targetReg when neither of the source operands was a matching register - if (regOp->GetRegNum() != REG_RAX) - { - inst_RV_RV(ins_Copy(targetType), REG_RAX, regOp->GetRegNum(), targetType); - } - - instruction ins; - if ((treeNode->gtFlags & GTF_UNSIGNED) == 0) - { - ins = INS_imulEAX; - } - else - { - ins = INS_mulEAX; - } - emit->emitInsBinary(ins, size, treeNode, rmOp); - - // Move the result to the desired register, if necessary - if (treeNode->OperGet() == GT_MULHI && targetReg != REG_RDX) - { - inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); - } - - genProduceReg(treeNode); -} - -#ifdef TARGET_X86 -//------------------------------------------------------------------------ -// genCodeForLongUMod: Generate code for a tree of the form -// `(umod (gt_long x y) (const int))` -// -// Arguments: -// node - the node for which to generate code -// -void CodeGen::genCodeForLongUMod(GenTreeOp* node) -{ - assert(node != nullptr); - assert(node->OperGet() == GT_UMOD); - assert(node->TypeGet() == TYP_INT); - - GenTreeOp* const dividend = node->gtOp1->AsOp(); - assert(dividend->OperGet() == GT_LONG); - assert(varTypeIsLong(dividend)); - - genConsumeOperands(node); - - GenTree* const dividendLo = dividend->gtOp1; - GenTree* const dividendHi = dividend->gtOp2; - assert(dividendLo->isUsedFromReg()); - assert(dividendHi->isUsedFromReg()); - - GenTree* const divisor = node->gtOp2; - assert(divisor->gtSkipReloadOrCopy()->OperGet() == GT_CNS_INT); - assert(divisor->gtSkipReloadOrCopy()->isUsedFromReg()); - assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal >= 2); - assert(divisor->gtSkipReloadOrCopy()->AsIntCon()->gtIconVal <= 0x3fffffff); - - // dividendLo must be in RAX; dividendHi must be in RDX - genCopyRegIfNeeded(dividendLo, REG_EAX); - genCopyRegIfNeeded(dividendHi, REG_EDX); - - // At this point, EAX:EDX contains the 64bit dividend and op2->GetRegNum() - // contains the 32bit divisor. We want to generate the following code: - // - // cmp edx, divisor->GetRegNum() - // jb noOverflow - // - // mov temp, eax - // mov eax, edx - // xor edx, edx - // div divisor->GetRegNum() - // mov eax, temp - // - // noOverflow: - // div divisor->GetRegNum() - // - // This works because (a * 2^32 + b) % c = ((a % c) * 2^32 + b) % c. 
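// Editorial note (not part of the patch): a minimal sketch of the identity the removed
// comment above relies on. The hardware "div r32" divides EDX:EAX by a 32-bit divisor
// and faults when the quotient does not fit in 32 bits, which is exactly the case
// hi >= divisor; reducing hi modulo the divisor first makes the final division safe
// without changing the remainder. Names are illustrative only.
#include <cassert>
#include <cstdint>

static uint32_t LongUModSketch(uint32_t hi, uint32_t lo, uint32_t divisor)
{
    assert(divisor >= 2);

    uint32_t reducedHi = hi;
    if (reducedHi >= divisor)
    {
        // First division: 0:hi / divisor leaves hi % divisor in EDX.
        reducedHi = reducedHi % divisor;
    }

    // Second division: (reducedHi:lo) / divisor cannot overflow, because
    // reducedHi < divisor guarantees the 32-bit quotient fits.
    uint32_t result = (uint32_t)(((((uint64_t)reducedHi) << 32) | lo) % divisor);

    // Identity used above: (a * 2^32 + b) % c == ((a % c) * 2^32 + b) % c.
    assert(result == (uint32_t)(((((uint64_t)hi) << 32) | lo) % divisor));
    return result;
}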
- - BasicBlock* const noOverflow = genCreateTempLabel(); - - // cmp edx, divisor->GetRegNum() - // jb noOverflow - inst_RV_RV(INS_cmp, REG_EDX, divisor->GetRegNum()); - inst_JMP(EJ_jb, noOverflow); - - // mov temp, eax - // mov eax, edx - // xor edx, edx - // div divisor->GetRegNum() - // mov eax, temp - const regNumber tempReg = node->GetSingleTempReg(); - inst_RV_RV(INS_mov, tempReg, REG_EAX, TYP_INT); - inst_RV_RV(INS_mov, REG_EAX, REG_EDX, TYP_INT); - instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX); - inst_RV(INS_div, divisor->GetRegNum(), TYP_INT); - inst_RV_RV(INS_mov, REG_EAX, tempReg, TYP_INT); - - // noOverflow: - // div divisor->GetRegNum() - genDefineTempLabel(noOverflow); - inst_RV(INS_div, divisor->GetRegNum(), TYP_INT); - - const regNumber targetReg = node->GetRegNum(); - if (targetReg != REG_EDX) - { - inst_RV_RV(INS_mov, targetReg, REG_RDX, TYP_INT); - } - genProduceReg(node); -} -#endif // TARGET_X86 - -//------------------------------------------------------------------------ -// genCodeForDivMod: Generate code for a DIV or MOD operation. -// -// Arguments: -// treeNode - the node to generate the code for -// -void CodeGen::genCodeForDivMod(GenTreeOp* treeNode) -{ - assert(treeNode->OperIs(GT_DIV, GT_UDIV, GT_MOD, GT_UMOD)); - - GenTree* dividend = treeNode->gtOp1; - -#ifdef TARGET_X86 - if (varTypeIsLong(dividend->TypeGet())) - { - genCodeForLongUMod(treeNode); - return; - } -#endif // TARGET_X86 - - GenTree* divisor = treeNode->gtOp2; - genTreeOps oper = treeNode->OperGet(); - emitAttr size = emitTypeSize(treeNode); - regNumber targetReg = treeNode->GetRegNum(); - var_types targetType = treeNode->TypeGet(); - emitter* emit = GetEmitter(); - - // Node's type must be int/native int, small integer types are not - // supported and floating point types are handled by genCodeForBinary. - assert(varTypeIsIntOrI(targetType)); - // dividend is in a register. - assert(dividend->isUsedFromReg()); - - genConsumeOperands(treeNode->AsOp()); - // dividend must be in RAX - genCopyRegIfNeeded(dividend, REG_RAX); - - // zero or sign extend rax to rdx - if (oper == GT_UMOD || oper == GT_UDIV || - (dividend->IsIntegralConst() && (dividend->AsIntConCommon()->IconValue() > 0))) - { - instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EDX); - } - else - { - emit->emitIns(INS_cdq, size); - // the cdq instruction writes RDX, So clear the gcInfo for RDX - gcInfo.gcMarkRegSetNpt(RBM_RDX); - } - - // Perform the 'targetType' (64-bit or 32-bit) divide instruction - instruction ins; - if (oper == GT_UMOD || oper == GT_UDIV) - { - ins = INS_div; - } - else - { - ins = INS_idiv; - } - - emit->emitInsBinary(ins, size, treeNode, divisor); - - // DIV/IDIV instructions always store the quotient in RAX and the remainder in RDX. - // Move the result to the desired register, if necessary - if (oper == GT_DIV || oper == GT_UDIV) - { - if (targetReg != REG_RAX) - { - inst_RV_RV(INS_mov, targetReg, REG_RAX, targetType); - } - } - else - { - assert((oper == GT_MOD) || (oper == GT_UMOD)); - if (targetReg != REG_RDX) - { - inst_RV_RV(INS_mov, targetReg, REG_RDX, targetType); - } - } - genProduceReg(treeNode); -} - -//------------------------------------------------------------------------ -// genCodeForBinary: Generate code for many binary arithmetic operators -// -// Arguments: -// treeNode - The binary operation for which we are generating code. -// -// Return Value: -// None. -// -// Notes: -// Integer MUL and DIV variants have special constraints on x64 so are not handled here. 
-// See the assert below for the operators that are handled. - -void CodeGen::genCodeForBinary(GenTreeOp* treeNode) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// genCodeForMul: Generate code for a MUL operation. -// -// Arguments: -// treeNode - the node to generate the code for -// -void CodeGen::genCodeForMul(GenTreeOp* treeNode) -{ - assert(false); -} - -#ifdef FEATURE_SIMD - -//------------------------------------------------------------------------ -// genSIMDSplitReturn: Generates code for returning a fixed-size SIMD type that lives -// in a single register, but is returned in multiple registers. -// -// Arguments: -// src - The source of the return -// retTypeDesc - The return type descriptor. -// -void CodeGen::genSIMDSplitReturn(GenTree* src, ReturnTypeDesc* retTypeDesc) -{ - assert(varTypeIsSIMD(src)); - assert(src->isUsedFromReg()); - - // This is a case of operand is in a single reg and needs to be - // returned in multiple ABI return registers. - regNumber opReg = src->GetRegNum(); - regNumber reg0 = retTypeDesc->GetABIReturnReg(0); - regNumber reg1 = retTypeDesc->GetABIReturnReg(1); - - assert((reg0 != REG_NA) && (reg1 != REG_NA) && (opReg != REG_NA)); - - const bool srcIsFloatReg = genIsValidFloatReg(opReg); - const bool dstIsFloatReg = genIsValidFloatReg(reg0); - assert(srcIsFloatReg); - -#ifdef TARGET_AMD64 - assert(src->TypeIs(TYP_SIMD16)); - assert(srcIsFloatReg == dstIsFloatReg); - if (opReg != reg0 && opReg != reg1) - { - // Operand reg is different from return regs. - // Copy opReg to reg0 and let it to be handled by one of the - // two cases below. - inst_RV_RV(ins_Copy(opReg, TYP_SIMD16), reg0, opReg, TYP_SIMD16); - opReg = reg0; - } - - if (opReg == reg0) - { - assert(opReg != reg1); - // reg1 = opReg. - inst_RV_RV(ins_Copy(opReg, TYP_SIMD16), reg1, opReg, TYP_SIMD16); - } - else - { - assert(opReg == reg1); - - // reg0 = opReg. - - inst_RV_RV(ins_Copy(opReg, TYP_SIMD16), reg0, opReg, TYP_SIMD16); - } - // reg0 - already has required 8-byte in bit position [63:0]. - // swap upper and lower 8-bytes of reg1 so that desired 8-byte is in bit position [63:0]. - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, reg1, reg1, 0x01); - -#else // TARGET_X86 - assert(src->TypeIs(TYP_SIMD8)); - assert(srcIsFloatReg != dstIsFloatReg); - assert((reg0 == REG_EAX) && (reg1 == REG_EDX)); - // reg0 = opReg[31:0] - inst_RV_RV(ins_Copy(opReg, TYP_INT), reg0, opReg, TYP_INT); - // reg1 = opRef[61:32] - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1); - } - else - { - int8_t shuffleMask = 1; // we only need [61:32]->[31:0], the rest is not read. - inst_RV_TT_IV(INS_pshufd, EA_8BYTE, opReg, src, shuffleMask); - inst_RV_RV(ins_Copy(opReg, TYP_INT), reg1, opReg, TYP_INT); - } -#endif // TARGET_X86 -} - -#endif // FEATURE_SIMD - -#if defined(TARGET_X86) - -//------------------------------------------------------------------------ -// genFloatReturn: Generates code for float return statement for x86. -// -// Note: treeNode's and op1's registers are already consumed. -// -// Arguments: -// treeNode - The GT_RETURN or GT_RETFILT tree node with float type. 
-// -// Return Value: -// None -// -void CodeGen::genFloatReturn(GenTree* treeNode) -{ - assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT); - assert(varTypeIsFloating(treeNode)); - - GenTree* op1 = treeNode->gtGetOp1(); - // Spill the return value register from an XMM register to the stack, then load it on the x87 stack. - // If it already has a home location, use that. Otherwise, we need a temp. - if (genIsRegCandidateLocal(op1) && compiler->lvaTable[op1->AsLclVarCommon()->GetLclNum()].lvOnFrame) - { - if (compiler->lvaTable[op1->AsLclVarCommon()->GetLclNum()].GetRegNum() != REG_STK) - { - op1->gtFlags |= GTF_SPILL; - inst_TT_RV(ins_Store(op1->gtType, compiler->isSIMDTypeLocalAligned(op1->AsLclVarCommon()->GetLclNum())), - emitTypeSize(op1->TypeGet()), op1, op1->GetRegNum()); - } - // Now, load it to the fp stack. - GetEmitter()->emitIns_S(INS_fld, emitTypeSize(op1), op1->AsLclVarCommon()->GetLclNum(), 0); - } - else - { - // Spill the value, which should be in a register, then load it to the fp stack. - // TODO-X86-CQ: Deal with things that are already in memory (don't call genConsumeReg yet). - op1->gtFlags |= GTF_SPILL; - regSet.rsSpillTree(op1->GetRegNum(), op1); - op1->gtFlags |= GTF_SPILLED; - op1->gtFlags &= ~GTF_SPILL; - - TempDsc* t = regSet.rsUnspillInPlace(op1, op1->GetRegNum()); - inst_FS_ST(INS_fld, emitActualTypeSize(op1->gtType), t, 0); - op1->gtFlags &= ~GTF_SPILLED; - regSet.tmpRlsTemp(t); - } -} -#endif // TARGET_X86 - -//------------------------------------------------------------------------ -// genCodeForCompare: Produce code for a GT_EQ/GT_NE/GT_LT/GT_LE/GT_GE/GT_GT/GT_TEST_EQ/GT_TEST_NE/GT_CMP node. -// -// Arguments: -// tree - the node -// -void CodeGen::genCodeForCompare(GenTreeOp* tree) -{ - assert(tree->OperIs(GT_EQ, GT_NE, GT_LT, GT_LE, GT_GE, GT_GT, GT_TEST_EQ, GT_TEST_NE, GT_CMP)); - - // TODO-XArch-CQ: Check if we can use the currently set flags. - // TODO-XArch-CQ: Check for the case where we can simply transfer the carry bit to a register - // (signed < or >= where targetReg != REG_NA) - - GenTree* op1 = tree->gtOp1; - var_types op1Type = op1->TypeGet(); - - if (varTypeIsFloating(op1Type)) - { - genCompareFloat(tree); - } - else - { - genCompareInt(tree); - } -} - -//------------------------------------------------------------------------ -// genCodeForBT: Generates code for a GT_BT node. -// -// Arguments: -// tree - The node. -// -//void CodeGen::genCodeForBT(GenTreeOp* bt) -//{ -// assert(false); -//}; -// clang-format on - -// clang-format off -const CodeGen::GenConditionDesc CodeGen::GenConditionDesc::map[32] -{ - { }, // NONE - { }, // 1 - { EJ_jl }, // SLT - { EJ_jle }, // SLE - { EJ_jge }, // SGE - { EJ_jg }, // SGT - { EJ_js }, // S - { EJ_jns }, // NS - - { EJ_je }, // EQ - { EJ_jne }, // NE - { EJ_jb }, // ULT - { EJ_jbe }, // ULE - { EJ_jae }, // UGE - { EJ_ja }, // UGT - { EJ_jb }, // C - { EJ_jae }, // NC - - // Floating point compare instructions (UCOMISS, UCOMISD etc.) set the condition flags as follows: - // ZF PF CF Meaning - // --------------------- - // 1 1 1 Unordered - // 0 0 0 Greater - // 0 0 1 Less Than - // 1 0 0 Equal - // - // Since ZF and CF are also set when the result is unordered, in some cases we first need to check - // PF before checking ZF/CF. In general, ordered conditions will result in a jump only if PF is not - // set and unordered conditions will result in a jump only if PF is set. 
- - { EJ_jnp, GT_AND, EJ_je }, // FEQ - { EJ_jne }, // FNE - { EJ_jnp, GT_AND, EJ_jb }, // FLT - { EJ_jnp, GT_AND, EJ_jbe }, // FLE - { EJ_jae }, // FGE - { EJ_ja }, // FGT - { EJ_jo }, // O - { EJ_jno }, // NO - - { EJ_je }, // FEQU - { EJ_jp, GT_OR, EJ_jne }, // FNEU - { EJ_jb }, // FLTU - { EJ_jbe }, // FLEU - { EJ_jp, GT_OR, EJ_jae }, // FGEU - { EJ_jp, GT_OR, EJ_ja }, // FGTU - { EJ_jp }, // P - { EJ_jnp }, // NP -}; -// clang-format on - - -//------------------------------------------------------------------------ -// inst_SETCC: Generate code to set a register to 0 or 1 based on a condition. -// -// Arguments: -// condition - The condition -// type - The type of the value to be produced -// dstReg - The destination register to be set to 1 or 0 -// -void CodeGen::inst_SETCC(GenCondition condition, var_types type, regNumber dstReg) -{ - assert(varTypeIsIntegral(type)); - assert(genIsValidIntReg(dstReg) && isByteReg(dstReg)); - - const GenConditionDesc& desc = GenConditionDesc::Get(condition); - - inst_SET(desc.jumpKind1, dstReg); - - if (desc.oper != GT_NONE) - { - BasicBlock* labelNext = genCreateTempLabel(); - inst_JMP((desc.oper == GT_OR) ? desc.jumpKind1 : emitter::emitReverseJumpKind(desc.jumpKind1), labelNext); - inst_SET(desc.jumpKind2, dstReg); - genDefineTempLabel(labelNext); - } - - if (!varTypeIsByte(type)) - { - GetEmitter()->emitIns_R_R(INS_movzx, EA_1BYTE, dstReg, dstReg); - } -} - -//------------------------------------------------------------------------ -// genCodeForReturnTrap: Produce code for a GT_RETURNTRAP node. -// -// Arguments: -// tree - the GT_RETURNTRAP node -// -void CodeGen::genCodeForReturnTrap(GenTreeOp* tree) -{ - assert(tree->OperGet() == GT_RETURNTRAP); - - // this is nothing but a conditional call to CORINFO_HELP_STOP_FOR_GC - // based on the contents of 'data' - - GenTree* data = tree->gtOp1; - genConsumeRegs(data); - GenTreeIntCon cns = intForm(TYP_INT, 0); - cns.SetContained(); - GetEmitter()->emitInsBinary(INS_cmp, emitTypeSize(TYP_INT), data, &cns); - - BasicBlock* skipLabel = genCreateTempLabel(); - - inst_JMP(EJ_je, skipLabel); - - // emit the call to the EE-helper that stops for GC (or other reasons) - regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT); - assert(genIsValidIntReg(tmpReg)); - - genEmitHelperCall(CORINFO_HELP_STOP_FOR_GC, 0, EA_UNKNOWN, tmpReg); - genDefineTempLabel(skipLabel); -} - -/***************************************************************************** - * - * Generate code for a single node in the tree. - * Preconditions: All operands have been evaluated - * - */ -void CodeGen::genCodeForTreeNode(GenTree* treeNode) -{ - assert(false); -} - -#ifdef FEATURE_SIMD -//---------------------------------------------------------------------------------- -// genMultiRegStoreToSIMDLocal: store multi-reg value to a single-reg SIMD local -// -// Arguments: -// lclNode - GentreeLclVar of GT_STORE_LCL_VAR -// -// Return Value: -// None -// -void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode) -{ -#ifdef UNIX_AMD64_ABI - regNumber dst = lclNode->GetRegNum(); - GenTree* op1 = lclNode->gtGetOp1(); - GenTree* actualOp1 = op1->gtSkipReloadOrCopy(); - unsigned regCount = - actualOp1->IsMultiRegLclVar() ? actualOp1->AsLclVar()->GetFieldCount(compiler) : actualOp1->GetMultiRegCount(); - assert(op1->IsMultiRegNode()); - genConsumeRegs(op1); - - // Right now the only enregistrable structs supported are SIMD types. - // They are only returned in 1 or 2 registers - the 1 register case is - // handled as a regular STORE_LCL_VAR. 
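// Editorial note (not part of the patch): a small sketch of why FEQ in the condition
// table above needs two jumps. UCOMISS/UCOMISD report an unordered (NaN) operand
// through PF, so ordered equality is "PF clear AND ZF set", while FNEU is the
// single-jump complement. In C++ operator== already returns false for NaN; the
// explicit isunordered check below just mirrors the separate PF test in the emitted
// jnp/je pair. Illustrative names only.
#include <cassert>
#include <cmath>
#include <limits>

static bool FloatEqOrdered(float a, float b)
{
    // EJ_jnp GT_AND EJ_je: not unordered, and equal.
    return !std::isunordered(a, b) && (a == b);
}

static bool FloatNeUnordered(float a, float b)
{
    // FNEU: unordered OR not equal - the complement of FloatEqOrdered.
    return std::isunordered(a, b) || (a != b);
}

static void FloatCompareDemo()
{
    const float nan = std::numeric_limits<float>::quiet_NaN();
    assert(FloatEqOrdered(1.0f, 1.0f));
    assert(!FloatEqOrdered(nan, nan));   // NaN is never ordered-equal
    assert(FloatNeUnordered(nan, 1.0f)); // unordered counts as "not equal"
}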
- // This case is always a call (AsCall() will assert if it is not). - GenTreeCall* call = actualOp1->AsCall(); - const ReturnTypeDesc* retTypeDesc = call->GetReturnTypeDesc(); - assert(retTypeDesc->GetReturnRegCount() == MAX_RET_REG_COUNT); - - assert(regCount == 2); - assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(0))); - assert(varTypeIsFloating(retTypeDesc->GetReturnRegType(1))); - - // This is a case where the two 8-bytes that comprise the operand are in - // two different xmm registers and need to be assembled into a single - // xmm register. - regNumber targetReg = lclNode->GetRegNum(); - regNumber reg0 = call->GetRegNumByIdx(0); - regNumber reg1 = call->GetRegNumByIdx(1); - - if (op1->IsCopyOrReload()) - { - // GT_COPY/GT_RELOAD will have valid reg for those positions - // that need to be copied or reloaded. - regNumber reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(0); - if (reloadReg != REG_NA) - { - reg0 = reloadReg; - } - - reloadReg = op1->AsCopyOrReload()->GetRegNumByIdx(1); - if (reloadReg != REG_NA) - { - reg1 = reloadReg; - } - } - - if (targetReg != reg0 && targetReg != reg1) - { - // targetReg = reg0; - // targetReg[127:64] = reg1[127:64] - inst_RV_RV(ins_Copy(TYP_DOUBLE), targetReg, reg0, TYP_DOUBLE); - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); - } - else if (targetReg == reg0) - { - // (elided) targetReg = reg0 - // targetReg[127:64] = reg1[127:64] - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg1, 0x00); - } - else - { - assert(targetReg == reg1); - // We need two shuffles to achieve this - // First: - // targetReg[63:0] = targetReg[63:0] - // targetReg[127:64] = reg0[63:0] - // - // Second: - // targetReg[63:0] = targetReg[127:64] - // targetReg[127:64] = targetReg[63:0] - // - // Essentially copy low 8-bytes from reg0 to high 8-bytes of targetReg - // and next swap low and high 8-bytes of targetReg to have them - // rearranged in the right order. - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, reg0, 0x00); - inst_RV_RV_IV(INS_shufpd, EA_16BYTE, targetReg, targetReg, 0x01); - } - genProduceReg(lclNode); -#else // !UNIX_AMD64_ABI - assert(!"Multireg store to SIMD reg not supported on X64 Windows"); -#endif // !UNIX_AMD64_ABI -} -#endif // FEATURE_SIMD - -//------------------------------------------------------------------------ -// genAllocLclFrame: Probe the stack and allocate the local stack frame - subtract from SP. -// -// Arguments: -// frameSize - the size of the stack frame being allocated. -// initReg - register to use as a scratch register. -// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if -// this call sets 'initReg' to a non-zero value. -// maskArgRegsLiveIn - incoming argument registers that are currently live. -// -// Return value: -// None -// -void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) -{ - assert(compiler->compGeneratingProlog); - - if (frameSize == 0) - { - return; - } - - const target_size_t pageSize = compiler->eeGetPageSize(); - - if (frameSize == REGSIZE_BYTES) - { - // Frame size is the same as register size. 
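// Editorial note (not part of the patch): a sketch of the SHUFPD trick used a little
// earlier in this hunk (genMultiRegStoreToSIMDLocal) to assemble one 16-byte value
// from the low 8 bytes of two XMM registers, written with SSE2 intrinsics. Assumes
// an SSE2-capable host; names are illustrative only.
#include <emmintrin.h>

static __m128d CombineLowHalves(__m128d reg0, __m128d reg1)
{
    // imm 0x00: result[63:0] = reg0[63:0], result[127:64] = reg1[63:0]
    // - the effect of "shufpd targetReg, reg1, 0x00" when targetReg == reg0.
    return _mm_shuffle_pd(reg0, reg1, 0x00);
}

static __m128d SwapHalves(__m128d v)
{
    // imm 0x01: result[63:0] = v[127:64], result[127:64] = v[63:0]
    // - the "shufpd targetReg, targetReg, 0x01" swap used when targetReg == reg1.
    return _mm_shuffle_pd(v, v, 0x01);
}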
- GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); - compiler->unwindAllocStack(frameSize); - } - else if (frameSize < pageSize) - { - GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); - compiler->unwindAllocStack(frameSize); - - const unsigned lastProbedLocToFinalSp = frameSize; - - if (lastProbedLocToFinalSp + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize) - { - // We haven't probed almost a complete page. If the next action on the stack might subtract from SP - // first, before touching the current SP, then we need to probe at the very bottom. This can - // happen on x86, for example, when we copy an argument to the stack using a "SUB ESP; REP MOV" - // strategy. - GetEmitter()->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_SPBASE, 0); - } - } - else - { -#ifdef TARGET_X86 - int spOffset = -(int)frameSize; - - if (compiler->info.compPublishStubParam) - { - GetEmitter()->emitIns_R(INS_push, EA_PTRSIZE, REG_SECRET_STUB_PARAM); - spOffset += REGSIZE_BYTES; - } - - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, spOffset); - regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); - - genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN); - - if (compiler->info.compPublishStubParam) - { - GetEmitter()->emitIns_R(INS_pop, EA_PTRSIZE, REG_SECRET_STUB_PARAM); - GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); - } - else - { - GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); - } -#else // !TARGET_X86 - static_assert_no_msg((RBM_STACK_PROBE_HELPER_ARG & (RBM_SECRET_STUB_PARAM | RBM_DEFAULT_HELPER_CALL_TARGET)) == - RBM_NONE); - - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, -(int)frameSize); - regSet.verifyRegUsed(REG_STACK_PROBE_HELPER_ARG); - - genEmitHelperCall(CORINFO_HELP_STACK_PROBE, 0, EA_UNKNOWN); - - if (initReg == REG_DEFAULT_HELPER_CALL_TARGET) - { - *pInitRegZeroed = false; - } - - static_assert_no_msg((RBM_STACK_PROBE_HELPER_TRASH & RBM_STACK_PROBE_HELPER_ARG) == RBM_NONE); - - GetEmitter()->emitIns_R_R(INS_mov, EA_PTRSIZE, REG_SPBASE, REG_STACK_PROBE_HELPER_ARG); -#endif // !TARGET_X86 - - compiler->unwindAllocStack(frameSize); - - if (initReg == REG_STACK_PROBE_HELPER_ARG) - { - *pInitRegZeroed = false; - } - } - -#ifdef USING_SCOPE_INFO - if (!doubleAlignOrFramePointerUsed()) - { - psiAdjustStackLevel(frameSize); - } -#endif // USING_SCOPE_INFO -} - -//------------------------------------------------------------------------ -// genStackPointerConstantAdjustment: add a specified constant value to the stack pointer. -// No probe is done. -// -// Arguments: -// spDelta - the value to add to SP. Must be negative or zero. -// regTmp - x86 only: an available temporary register. If not REG_NA, hide the SP -// adjustment from the emitter, using this register. -// -// Return Value: -// None. -// -void CodeGen::genStackPointerConstantAdjustment(ssize_t spDelta, regNumber regTmp) -{ - assert(spDelta < 0); - - // We assert that the SP change is less than one page. If it's greater, you should have called a - // function that does a probe, which will in turn call this function. - assert((target_size_t)(-spDelta) <= compiler->eeGetPageSize()); - -#ifdef TARGET_X86 - if (regTmp != REG_NA) - { - // For x86, some cases don't want to use "sub ESP" because we don't want the emitter to track the adjustment - // to ESP. So do the work in the count register. 
- // TODO-CQ: manipulate ESP directly, to share code, reduce #ifdefs, and improve CQ. This would require - // creating a way to temporarily turn off the emitter's tracking of ESP, maybe marking instrDescs as "don't - // track". - inst_RV_RV(INS_mov, regTmp, REG_SPBASE, TYP_I_IMPL); - inst_RV_IV(INS_sub, regTmp, (target_ssize_t)-spDelta, EA_PTRSIZE); - inst_RV_RV(INS_mov, REG_SPBASE, regTmp, TYP_I_IMPL); - } - else -#endif // TARGET_X86 - { - inst_RV_IV(INS_sub, REG_SPBASE, (target_ssize_t)-spDelta, EA_PTRSIZE); - } -} - -//------------------------------------------------------------------------ -// genStackPointerConstantAdjustmentWithProbe: add a specified constant value to the stack pointer, -// and probe the stack as appropriate. Should only be called as a helper for -// genStackPointerConstantAdjustmentLoopWithProbe. -// -// Arguments: -// spDelta - the value to add to SP. Must be negative or zero. If zero, the probe happens, -// but the stack pointer doesn't move. -// regTmp - x86 only: an available temporary register. If not REG_NA, hide the SP -// adjustment from the emitter, using this register. -// -// Return Value: -// None. -// -void CodeGen::genStackPointerConstantAdjustmentWithProbe(ssize_t spDelta, regNumber regTmp) -{ - GetEmitter()->emitIns_AR_R(INS_TEST, EA_4BYTE, REG_SPBASE, REG_SPBASE, 0); - genStackPointerConstantAdjustment(spDelta, regTmp); -} - -//------------------------------------------------------------------------ -// genStackPointerConstantAdjustmentLoopWithProbe: Add a specified constant value to the stack pointer, -// and probe the stack as appropriate. Generates one probe per page, up to the total amount required. -// This will generate a sequence of probes in-line. It is required for the case where we need to expose -// (not hide) the stack level adjustment. We can't use the dynamic loop in that case, because the total -// stack adjustment would not be visible to the emitter. It would be possible to use this version for -// multiple hidden constant stack level adjustments but we don't do that currently (we use the loop -// version in genStackPointerDynamicAdjustmentWithProbe instead). -// -// Arguments: -// spDelta - the value to add to SP. Must be negative. -// regTmp - x86 only: an available temporary register. If not REG_NA, hide the SP -// adjustment from the emitter, using this register. -// -// Return Value: -// Offset in bytes from SP to last probed address. -// -target_ssize_t CodeGen::genStackPointerConstantAdjustmentLoopWithProbe(ssize_t spDelta, regNumber regTmp) -{ - assert(spDelta < 0); - - const target_size_t pageSize = compiler->eeGetPageSize(); - - ssize_t spRemainingDelta = spDelta; - do - { - ssize_t spOneDelta = -(ssize_t)min((target_size_t)-spRemainingDelta, pageSize); - genStackPointerConstantAdjustmentWithProbe(spOneDelta, regTmp); - spRemainingDelta -= spOneDelta; - } while (spRemainingDelta < 0); - - // What offset from the final SP was the last probe? This depends on the fact that - // genStackPointerConstantAdjustmentWithProbe() probes first, then does "SUB SP". - target_size_t lastTouchDelta = (target_size_t)(-spDelta) % pageSize; - if ((lastTouchDelta == 0) || (lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize)) - { - // We haven't probed almost a complete page. If lastTouchDelta==0, then spDelta was an exact - // multiple of pageSize, which means we last probed exactly one page back. Otherwise, we probed - // the page, but very far from the end. 
If the next action on the stack might subtract from SP - // first, before touching the current SP, then we do one more probe at the very bottom. This can - // happen on x86, for example, when we copy an argument to the stack using a "SUB ESP; REP MOV" - // strategy. - - GetEmitter()->emitIns_AR_R(INS_test, EA_PTRSIZE, REG_EAX, REG_SPBASE, 0); - lastTouchDelta = 0; - } - - return lastTouchDelta; -} - -//------------------------------------------------------------------------ -// genStackPointerDynamicAdjustmentWithProbe: add a register value to the stack pointer, -// and probe the stack as appropriate. -// -// Note that for x86, we hide the ESP adjustment from the emitter. To do that, currently, -// requires a temporary register and extra code. -// -// Arguments: -// regSpDelta - the register value to add to SP. The value in this register must be negative. -// This register might be trashed. -// regTmp - an available temporary register. Will be trashed. -// -// Return Value: -// None. -// -//void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta, regNumber regTmp) -//{ -// assert(false); -//} - -//------------------------------------------------------------------------ -// genLclHeap: Generate code for localloc. -// -// Arguments: -// tree - the localloc tree to generate. -// -// Notes: -// Note that for x86, we don't track ESP movements while generating the localloc code. -// The ESP tracking is used to report stack pointer-relative GC info, which is not -// interesting while doing the localloc construction. Also, for functions with localloc, -// we have EBP frames, and EBP-relative locals, and ESP-relative accesses only for function -// call arguments. -// -// For x86, we store the ESP after the localloc is complete in the LocAllocSP -// variable. This variable is implicitly reported to the VM in the GC info (its position -// is defined by convention relative to other items), and is used by the GC to find the -// "base" stack pointer in functions with localloc. -// -void CodeGen::genLclHeap(GenTree* tree) -{ - assert(false); -} - -// -//------------------------------------------------------------------------ -// genCodeForInitBlkRepStos: Generate code for InitBlk using rep stos. -// -// Arguments: -// initBlkNode - The Block store for which we are generating code. -// -void CodeGen::genCodeForInitBlkRepStos(GenTreeBlk* initBlkNode) -{ - genConsumeBlockOp(initBlkNode, REG_RDI, REG_RAX, REG_RCX); - instGen(INS_r_stosb); -} - -//---------------------------------------------------------------------------------- -// genCodeForInitBlkUnroll: Generate unrolled block initialization code. 
-// -// Arguments: -// node - the GT_STORE_BLK node to generate code for -// -void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) -{ - assert(node->OperIs(GT_STORE_BLK)); - - unsigned dstLclNum = BAD_VAR_NUM; - regNumber dstAddrBaseReg = REG_NA; - regNumber dstAddrIndexReg = REG_NA; - unsigned dstAddrIndexScale = 1; - int dstOffset = 0; - GenTree* dstAddr = node->Addr(); - - if (!dstAddr->isContained()) - { - dstAddrBaseReg = genConsumeReg(dstAddr); - } - else if (dstAddr->OperIsAddrMode()) - { - GenTreeAddrMode* addrMode = dstAddr->AsAddrMode(); - - if (addrMode->HasBase()) - { - dstAddrBaseReg = genConsumeReg(addrMode->Base()); - } - - if (addrMode->HasIndex()) - { - dstAddrIndexReg = genConsumeReg(addrMode->Index()); - dstAddrIndexScale = addrMode->GetScale(); - } - - dstOffset = addrMode->Offset(); - } - else - { - assert(dstAddr->OperIsLocalAddr()); - dstLclNum = dstAddr->AsLclVarCommon()->GetLclNum(); - dstOffset = dstAddr->AsLclVarCommon()->GetLclOffs(); - } - - regNumber srcIntReg = REG_NA; - GenTree* src = node->Data(); - - if (src->OperIs(GT_INIT_VAL)) - { - assert(src->isContained()); - src = src->AsUnOp()->gtGetOp1(); - } - - if (!src->isContained()) - { - srcIntReg = genConsumeReg(src); - } - else - { - // If src is contained then it must be 0 and the size must be a multiple - // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions. - assert(src->IsIntegralConst(0)); - assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0); - } - - emitter* emit = GetEmitter(); - unsigned size = node->GetLayout()->GetSize(); - - assert(size <= INT32_MAX); - assert(dstOffset < (INT32_MAX - static_cast(size))); - - // Fill as much as possible using SSE2 stores. - if (size >= XMM_REGSIZE_BYTES) - { - regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT); - - if (src->gtSkipReloadOrCopy()->IsIntegralConst(0)) - { - // If the source is constant 0 then always use xorps, it's faster - // than copying the constant from a GPR to a XMM register. - emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg); - } - else - { - emit->emitIns_R_R(INS_mov_i2xmm, EA_PTRSIZE, srcXmmReg, srcIntReg); - emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); -#ifdef TARGET_X86 - // For x86, we need one more to convert it from 8 bytes to 16 bytes. - emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg); -#endif - } - - instruction simdMov = simdUnalignedMovIns(); - for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize) - { - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); - } - } - - // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores. - // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need - // to allocate a GPR just for the remainder. - } - - // Fill the remainder using normal stores. 
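// Editorial note (not part of the patch): a scalar sketch of the unrolled InitBlk shape
// used above - broadcast the fill value, store 16 bytes at a time while possible, then
// finish the tail with stores that halve in size (8, 4, 2, 1). Illustrative only; the
// real code emits SSE2 and GPR stores rather than memset/memcpy calls.
#include <cstdint>
#include <cstring>

static void InitBlkUnrollSketch(uint8_t* dst, uint8_t value, unsigned size)
{
    uint8_t chunk[16];
    std::memset(chunk, value, sizeof(chunk)); // broadcast, like punpckldq on an XMM reg

    while (size >= 16)
    {
        std::memcpy(dst, chunk, 16); // stand-in for one movdqu store
        dst += 16;
        size -= 16;
    }

    for (unsigned regSize = 8; size > 0; size -= regSize, dst += regSize)
    {
        while (regSize > size)
        {
            regSize /= 2; // 8 -> 4 -> 2 -> 1, mirroring the remainder loop above
        }
        std::memcpy(dst, chunk, regSize); // stand-in for a GPR-sized mov store
    }
}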
- for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize) - { - while (regSize > size) - { - regSize /= 2; - } - - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); - } - } -} - -#ifdef TARGET_AMD64 -//------------------------------------------------------------------------ -// genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call -// -// Arguments: -// initBlkNode - the GT_STORE_[BLK|OBJ|DYN_BLK] -// -// Preconditions: -// The register assignments have been set appropriately. -// This is validated by genConsumeBlockOp(). -// -void CodeGen::genCodeForInitBlkHelper(GenTreeBlk* initBlkNode) -{ - // Destination address goes in arg0, source address goes in arg1, and size goes in arg2. - // genConsumeBlockOp takes care of this for us. - genConsumeBlockOp(initBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); - - genEmitHelperCall(CORINFO_HELP_MEMSET, 0, EA_UNKNOWN); -} -#endif // TARGET_AMD64 - -#ifdef FEATURE_PUT_STRUCT_ARG_STK -// Generate code for a load from some address + offset -// baseNode: tree node which can be either a local address or arbitrary node -// offset: distance from the baseNode from which to load -void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* baseNode, unsigned offset) -{ - emitter* emit = GetEmitter(); - - if (baseNode->OperIsLocalAddr()) - { - const GenTreeLclVarCommon* lclVar = baseNode->AsLclVarCommon(); - offset += lclVar->GetLclOffs(); - emit->emitIns_R_S(ins, size, dst, lclVar->GetLclNum(), offset); - } - else - { - emit->emitIns_R_AR(ins, size, dst, baseNode->GetRegNum(), offset); - } -} -#endif // FEATURE_PUT_STRUCT_ARG_STK - -//---------------------------------------------------------------------------------- -// genCodeForCpBlkUnroll - Generate unrolled block copy code. 
-// -// Arguments: -// node - the GT_STORE_BLK node to generate code for -// -void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) -{ - assert(node->OperIs(GT_STORE_BLK)); - - unsigned dstLclNum = BAD_VAR_NUM; - regNumber dstAddrBaseReg = REG_NA; - regNumber dstAddrIndexReg = REG_NA; - unsigned dstAddrIndexScale = 1; - int dstOffset = 0; - GenTree* dstAddr = node->Addr(); - - if (!dstAddr->isContained()) - { - dstAddrBaseReg = genConsumeReg(dstAddr); - } - else if (dstAddr->OperIsAddrMode()) - { - GenTreeAddrMode* addrMode = dstAddr->AsAddrMode(); - - if (addrMode->HasBase()) - { - dstAddrBaseReg = genConsumeReg(addrMode->Base()); - } - - if (addrMode->HasIndex()) - { - dstAddrIndexReg = genConsumeReg(addrMode->Index()); - dstAddrIndexScale = addrMode->GetScale(); - } - - dstOffset = addrMode->Offset(); - } - else - { - assert(dstAddr->OperIsLocalAddr()); - const GenTreeLclVarCommon* lclVar = dstAddr->AsLclVarCommon(); - dstLclNum = lclVar->GetLclNum(); - dstOffset = lclVar->GetLclOffs(); - } - - unsigned srcLclNum = BAD_VAR_NUM; - regNumber srcAddrBaseReg = REG_NA; - regNumber srcAddrIndexReg = REG_NA; - unsigned srcAddrIndexScale = 1; - int srcOffset = 0; - GenTree* src = node->Data(); - - assert(src->isContained()); - - if (src->OperIs(GT_LCL_VAR, GT_LCL_FLD)) - { - srcLclNum = src->AsLclVarCommon()->GetLclNum(); - srcOffset = src->AsLclVarCommon()->GetLclOffs(); - } - else - { - assert(src->OperIs(GT_IND)); - GenTree* srcAddr = src->AsIndir()->Addr(); - - if (!srcAddr->isContained()) - { - srcAddrBaseReg = genConsumeReg(srcAddr); - } - else if (srcAddr->OperIsAddrMode()) - { - GenTreeAddrMode* addrMode = srcAddr->AsAddrMode(); - - if (addrMode->HasBase()) - { - srcAddrBaseReg = genConsumeReg(addrMode->Base()); - } - - if (addrMode->HasIndex()) - { - srcAddrIndexReg = genConsumeReg(addrMode->Index()); - srcAddrIndexScale = addrMode->GetScale(); - } - - srcOffset = addrMode->Offset(); - } - else - { - assert(srcAddr->OperIsLocalAddr()); - srcLclNum = srcAddr->AsLclVarCommon()->GetLclNum(); - srcOffset = srcAddr->AsLclVarCommon()->GetLclOffs(); - } - } - - emitter* emit = GetEmitter(); - unsigned size = node->GetLayout()->GetSize(); - - assert(size <= INT32_MAX); - assert(srcOffset < (INT32_MAX - static_cast(size))); - assert(dstOffset < (INT32_MAX - static_cast(size))); - - if (size >= XMM_REGSIZE_BYTES) - { - regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT); - - instruction simdMov = simdUnalignedMovIns(); - for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; - size -= regSize, srcOffset += regSize, dstOffset += regSize) - { - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); - } - else - { - emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, - srcAddrIndexScale, srcOffset); - } - - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); - } - } - - // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores. - // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to - // allocate a GPR just for the remainder. 
- } - - if (size > 0) - { - regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT); - - for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize) - { - while (regSize > size) - { - regSize /= 2; - } - - if (srcLclNum != BAD_VAR_NUM) - { - emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); - } - else - { - emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, - srcAddrIndexScale, srcOffset); - } - - if (dstLclNum != BAD_VAR_NUM) - { - emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); - } - else - { - emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, - dstAddrIndexScale, dstOffset); - } - } - } -} - -//---------------------------------------------------------------------------------- -// genCodeForCpBlkRepMovs - Generate code for CpBlk by using rep movs -// -// Arguments: -// cpBlkNode - the GT_STORE_[BLK|OBJ|DYN_BLK] -// -// Preconditions: -// The register assignments have been set appropriately. -// This is validated by genConsumeBlockOp(). -// -void CodeGen::genCodeForCpBlkRepMovs(GenTreeBlk* cpBlkNode) -{ - // Destination address goes in RDI, source address goes in RSE, and size goes in RCX. - // genConsumeBlockOp takes care of this for us. - genConsumeBlockOp(cpBlkNode, REG_RDI, REG_RSI, REG_RCX); - instGen(INS_r_movsb); -} - -#ifdef FEATURE_PUT_STRUCT_ARG_STK -//------------------------------------------------------------------------ -// CodeGen::genMove8IfNeeded: Conditionally move 8 bytes of a struct to the argument area -// -// Arguments: -// size - The size of bytes remaining to be moved -// longTmpReg - The tmp register to be used for the long value -// srcAddr - The address of the source struct -// offset - The current offset being copied -// -// Return Value: -// Returns the number of bytes moved (8 or 0). -// -// Notes: -// This is used in the PutArgStkKindUnroll case, to move any bytes that are -// not an even multiple of 16. -// On x86, longTmpReg must be an xmm reg; on x64 it must be an integer register. -// This is checked by genStoreRegToStackArg. -// -unsigned CodeGen::genMove8IfNeeded(unsigned size, regNumber longTmpReg, GenTree* srcAddr, unsigned offset) -{ -#ifdef TARGET_X86 - instruction longMovIns = INS_movq; -#else // !TARGET_X86 - instruction longMovIns = INS_mov; -#endif // !TARGET_X86 - if ((size & 8) != 0) - { - genCodeForLoadOffset(longMovIns, EA_8BYTE, longTmpReg, srcAddr, offset); - genStoreRegToStackArg(TYP_LONG, longTmpReg, offset); - return 8; - } - return 0; -} - -//------------------------------------------------------------------------ -// CodeGen::genMove4IfNeeded: Conditionally move 4 bytes of a struct to the argument area -// -// Arguments: -// size - The size of bytes remaining to be moved -// intTmpReg - The tmp register to be used for the long value -// srcAddr - The address of the source struct -// offset - The current offset being copied -// -// Return Value: -// Returns the number of bytes moved (4 or 0). -// -// Notes: -// This is used in the PutArgStkKindUnroll case, to move any bytes that are -// not an even multiple of 16. -// intTmpReg must be an integer register. -// This is checked by genStoreRegToStackArg. 
-// -unsigned CodeGen::genMove4IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset) -{ - if ((size & 4) != 0) - { - genCodeForLoadOffset(INS_mov, EA_4BYTE, intTmpReg, srcAddr, offset); - genStoreRegToStackArg(TYP_INT, intTmpReg, offset); - return 4; - } - return 0; -} - -//------------------------------------------------------------------------ -// CodeGen::genMove2IfNeeded: Conditionally move 2 bytes of a struct to the argument area -// -// Arguments: -// size - The size of bytes remaining to be moved -// intTmpReg - The tmp register to be used for the long value -// srcAddr - The address of the source struct -// offset - The current offset being copied -// -// Return Value: -// Returns the number of bytes moved (2 or 0). -// -// Notes: -// This is used in the PutArgStkKindUnroll case, to move any bytes that are -// not an even multiple of 16. -// intTmpReg must be an integer register. -// This is checked by genStoreRegToStackArg. -// -unsigned CodeGen::genMove2IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset) -{ - if ((size & 2) != 0) - { - genCodeForLoadOffset(INS_mov, EA_2BYTE, intTmpReg, srcAddr, offset); - genStoreRegToStackArg(TYP_SHORT, intTmpReg, offset); - return 2; - } - return 0; -} - -//------------------------------------------------------------------------ -// CodeGen::genMove1IfNeeded: Conditionally move 1 byte of a struct to the argument area -// -// Arguments: -// size - The size of bytes remaining to be moved -// intTmpReg - The tmp register to be used for the long value -// srcAddr - The address of the source struct -// offset - The current offset being copied -// -// Return Value: -// Returns the number of bytes moved (1 or 0). -// -// Notes: -// This is used in the PutArgStkKindUnroll case, to move any bytes that are -// not an even multiple of 16. -// intTmpReg must be an integer register. -// This is checked by genStoreRegToStackArg. -// -unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree* srcAddr, unsigned offset) -{ - if ((size & 1) != 0) - { - genCodeForLoadOffset(INS_mov, EA_1BYTE, intTmpReg, srcAddr, offset); - genStoreRegToStackArg(TYP_BYTE, intTmpReg, offset); - return 1; - } - return 0; -} - -//---------------------------------------------------------------------------------------------------------------// -// genStructPutArgUnroll: Generates code for passing a struct arg on stack by value using loop unrolling. -// -// Arguments: -// putArgNode - the PutArgStk tree. -// -// Notes: -// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct will be copied to the -// stack. -// -// TODO-Amd64-Unix: Try to share code with copyblk. -// Need refactoring of copyblk before it could be used for putarg_stk. -// The difference for now is that a putarg_stk contains its children, while cpyblk does not. -// This creates differences in code. After some significant refactoring it could be reused. -// -void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode) -{ - GenTree* src = putArgNode->AsOp()->gtOp1; - // We will never call this method for SIMD types, which are stored directly - // in genPutStructArgStk(). 
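// Editorial note (not part of the patch): the genMove8/4/2/1IfNeeded helpers above each
// move one chunk when the corresponding bit is set in the remaining size. Because the
// remainder after the 16-byte copies is less than 16, testing the 8, 4, 2 and 1 bits in
// order covers it exactly. A scalar sketch; names are illustrative only.
#include <cstdint>
#include <cstring>

static unsigned MoveTailSketch(uint8_t* dst, const uint8_t* src, unsigned size)
{
    unsigned offset = 0;
    for (unsigned chunk = 8; chunk >= 1; chunk /= 2)
    {
        if ((size & chunk) != 0)
        {
            std::memcpy(dst + offset, src + offset, chunk);
            offset += chunk;
        }
    }
    return offset; // equals size whenever size < 16
}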
- noway_assert(src->TypeGet() == TYP_STRUCT); - - unsigned size = putArgNode->GetStackByteSize(); - assert(size <= CPBLK_UNROLL_LIMIT); - - emitter* emit = GetEmitter(); - unsigned putArgOffset = putArgNode->getArgOffset(); - - assert(src->isContained()); - - assert(src->gtOper == GT_OBJ); - - if (src->AsOp()->gtOp1->isUsedFromReg()) - { - genConsumeReg(src->AsOp()->gtOp1); - } - - unsigned offset = 0; - - regNumber xmmTmpReg = REG_NA; - regNumber intTmpReg = REG_NA; - regNumber longTmpReg = REG_NA; -#ifdef TARGET_X86 - // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's - // less than 16 bytes, we will just be using pushes - if (size >= 8) - { - xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT); - longTmpReg = xmmTmpReg; - } - if ((size & 0x7) != 0) - { - intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT); - } -#else // !TARGET_X86 - // On x64 we use an XMM register only for 16-byte chunks. - if (size >= XMM_REGSIZE_BYTES) - { - xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT); - } - if ((size & 0xf) != 0) - { - intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT); - longTmpReg = intTmpReg; - } -#endif // !TARGET_X86 - - // If the size of this struct is larger than 16 bytes - // let's use SSE2 to be able to do 16 byte at a time - // loads and stores. - if (size >= XMM_REGSIZE_BYTES) - { -#ifdef TARGET_X86 - assert(!m_pushStkArg); -#endif // TARGET_X86 - size_t slots = size / XMM_REGSIZE_BYTES; - - assert(putArgNode->gtGetOp1()->isContained()); - assert(putArgNode->gtGetOp1()->AsOp()->gtOper == GT_OBJ); - - // TODO: In the below code the load and store instructions are for 16 bytes, but the - // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but - // this probably needs to be changed. - while (slots-- > 0) - { - // Load - genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset); - - // Store - genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset); - - offset += XMM_REGSIZE_BYTES; - } - } - - // Fill the remainder (15 bytes or less) if there's one. - if ((size & 0xf) != 0) - { -#ifdef TARGET_X86 - if (m_pushStkArg) - { - // This case is currently supported only for the case where the total size is - // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse - // order. However, morph has ensured that we have a struct that is an even - // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment. - assert(((size & 0xc) == size) && (offset == 0)); - // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on - // whether we've got an 8 byte chunk, and then push it on the stack. - unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, size & 0x8); - // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk) - // and push it on the stack. - pushedBytes += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, 0); - } - else -#endif // TARGET_X86 - { - offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset); - offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset); - offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset); - offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset); - assert(offset == size); - } - } -} - -//------------------------------------------------------------------------ -// genStructPutArgRepMovs: Generates code for passing a struct arg by value on stack using Rep Movs. -// -// Arguments: -// putArgNode - the PutArgStk tree. 
-// -// Preconditions: -// m_stkArgVarNum must be set to the base var number, relative to which the by-val struct bits will go. -// -void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode) -{ - GenTree* srcAddr = putArgNode->gtGetOp1(); - assert(srcAddr->TypeGet() == TYP_STRUCT); - - // Make sure we got the arguments of the cpblk operation in the right registers, and that - // 'srcAddr' is contained as expected. - assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI)); - assert(srcAddr->isContained()); - - genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX); - instGen(INS_r_movsb); -} - -//------------------------------------------------------------------------ -// If any Vector3 args are on stack and they are not pass-by-ref, the upper 32bits -// must be cleared to zeroes. The native compiler doesn't clear the upper bits -// and there is no way to know if the caller is native or not. So, the upper -// 32 bits of Vector argument on stack are always cleared to zero. -#if defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD) -void CodeGen::genClearStackVec3ArgUpperBits() -{ -#ifdef DEBUG - if (verbose) - { - printf("*************** In genClearStackVec3ArgUpperBits()\n"); - } -#endif - - assert(compiler->compGeneratingProlog); - - unsigned varNum = 0; - - for (unsigned varNum = 0; varNum < compiler->info.compArgsCount; varNum++) - { - LclVarDsc* varDsc = &(compiler->lvaTable[varNum]); - assert(varDsc->lvIsParam); - - // Does var has simd12 type? - if (varDsc->lvType != TYP_SIMD12) - { - continue; - } - - if (!varDsc->lvIsRegArg) - { - // Clear the upper 32 bits by mov dword ptr [V_ARG_BASE+0xC], 0 - GetEmitter()->emitIns_S_I(ins_Store(TYP_INT), EA_4BYTE, varNum, genTypeSize(TYP_FLOAT) * 3, 0); - } - else - { - // Assume that for x64 linux, an argument is fully in registers - // or fully on stack. - regNumber argReg = varDsc->GetOtherArgReg(); - - // Clear the upper 32 bits by two shift instructions. - // argReg = argReg << 96 - GetEmitter()->emitIns_R_I(INS_pslldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); - // argReg = argReg >> 96 - GetEmitter()->emitIns_R_I(INS_psrldq, emitActualTypeSize(TYP_SIMD12), argReg, 12); - } - } -} -#endif // defined(UNIX_AMD64_ABI) && defined(FEATURE_SIMD) -#endif // FEATURE_PUT_STRUCT_ARG_STK - -// -// genCodeForCpObj - Generate code for CpObj nodes to copy structs that have interleaved -// GC pointers. -// -// Arguments: -// cpObjNode - the GT_STORE_OBJ -// -// Notes: -// This will generate a sequence of movsp instructions for the cases of non-gc members. -// Note that movsp is an alias for movsd on x86 and movsq on x64. -// and calls to the BY_REF_ASSIGN helper otherwise. -// -// Preconditions: -// The register assignments have been set appropriately. -// This is validated by genConsumeBlockOp(). -// -void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode) -{ - assert(false); -} - -#ifdef TARGET_AMD64 -//---------------------------------------------------------------------------------- -// genCodeForCpBlkHelper - Generate code for a CpBlk node by the means of the VM memcpy helper call -// -// Arguments: -// cpBlkNode - the GT_STORE_[BLK|OBJ|DYN_BLK] -// -// Preconditions: -// The register assignments have been set appropriately. -// This is validated by genConsumeBlockOp(). -// -void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode) -{ - // Destination address goes in arg0, source address goes in arg1, and size goes in arg2. - // genConsumeBlockOp takes care of this for us. 
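// Editorial note (not part of the patch): a sketch of the PSLLDQ/PSRLDQ pair used in
// genClearStackVec3ArgUpperBits above to keep only the low 4 bytes of an XMM register
// (the spilled-over Vector3 element) and zero everything above them. Written with SSE2
// intrinsics; illustrative only.
#include <emmintrin.h>

static __m128i KeepLow4BytesOnly(__m128i v)
{
    v = _mm_slli_si128(v, 12); // shift left by 12 bytes: the low 4 bytes move to the top
    v = _mm_srli_si128(v, 12); // shift right by 12 bytes: back down, upper 96 bits now zero
    return v;
}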
- genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); - - genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN); -} -#endif // TARGET_AMD64 - -// generate code do a switch statement based on a table of ip-relative offsets -void CodeGen::genTableBasedSwitch(GenTree* treeNode) -{ - genConsumeOperands(treeNode->AsOp()); - regNumber idxReg = treeNode->AsOp()->gtOp1->GetRegNum(); - regNumber baseReg = treeNode->AsOp()->gtOp2->GetRegNum(); - - regNumber tmpReg = treeNode->GetSingleTempReg(); - - // load the ip-relative offset (which is relative to start of fgFirstBB) - GetEmitter()->emitIns_R_ARX(INS_mov, EA_4BYTE, baseReg, baseReg, idxReg, 4, 0); - - // add it to the absolute address of fgFirstBB - compiler->fgFirstBB->bbFlags |= BBF_JMP_TARGET; - GetEmitter()->emitIns_R_L(INS_lea, EA_PTR_DSP_RELOC, compiler->fgFirstBB, tmpReg); - GetEmitter()->emitIns_R_R(INS_add, EA_PTRSIZE, baseReg, tmpReg); - // jmp baseReg - GetEmitter()->emitIns_R(INS_i_jmp, emitTypeSize(TYP_I_IMPL), baseReg); -} - -// emits the table and an instruction to get the address of the first element -void CodeGen::genJumpTable(GenTree* treeNode) -{ - noway_assert(compiler->compCurBB->bbJumpKind == BBJ_SWITCH); - assert(treeNode->OperGet() == GT_JMPTABLE); - - unsigned jumpCount = compiler->compCurBB->bbJumpSwt->bbsCount; - BasicBlock** jumpTable = compiler->compCurBB->bbJumpSwt->bbsDstTab; - unsigned jmpTabOffs; - unsigned jmpTabBase; - - jmpTabBase = GetEmitter()->emitBBTableDataGenBeg(jumpCount, true); - - jmpTabOffs = 0; - - JITDUMP("\n J_M%03u_DS%02u LABEL DWORD\n", compiler->compMethodID, jmpTabBase); - - for (unsigned i = 0; i < jumpCount; i++) - { - BasicBlock* target = *jumpTable++; - noway_assert(target->bbFlags & BBF_JMP_TARGET); - - JITDUMP(" DD L_M%03u_" FMT_BB "\n", compiler->compMethodID, target->bbNum); - - GetEmitter()->emitDataGenData(i, target); - }; - - GetEmitter()->emitDataGenEnd(); - - // Access to inline data is 'abstracted' by a special type of static member - // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference - // to constant data, not a real static field. - GetEmitter()->emitIns_R_C(INS_lea, emitTypeSize(TYP_I_IMPL), treeNode->GetRegNum(), - compiler->eeFindJitDataOffs(jmpTabBase), 0); - genProduceReg(treeNode); -} - -//------------------------------------------------------------------------ -// genCodeForLockAdd: Generate code for a GT_LOCKADD node -// -// Arguments: -// node - the GT_LOCKADD node -// -//void CodeGen::genCodeForLockAdd(GenTreeOp* node) -//{ -// assert(false); -//} - -//------------------------------------------------------------------------ -// genLockedInstructions: Generate code for a GT_XADD or GT_XCHG node. -// -// Arguments: -// node - the GT_XADD/XCHG node -// -void CodeGen::genLockedInstructions(GenTreeOp* node) -{ - assert(node->OperIs(GT_XADD, GT_XCHG)); - - GenTree* addr = node->gtGetOp1(); - GenTree* data = node->gtGetOp2(); - emitAttr size = emitTypeSize(node->TypeGet()); - - assert(addr->isUsedFromReg()); - assert(data->isUsedFromReg()); - assert((size == EA_4BYTE) || (size == EA_PTRSIZE)); - - genConsumeOperands(node); - - if (node->GetRegNum() != data->GetRegNum()) - { - // If the destination register is different from the data register then we need - // to first move the data to the target register. Make sure we don't overwrite - // the address, the register allocator should have taken care of this. 
- assert(node->GetRegNum() != addr->GetRegNum()); - GetEmitter()->emitIns_R_R(INS_mov, size, node->GetRegNum(), data->GetRegNum()); - } - - instruction ins = node->OperIs(GT_XADD) ? INS_xadd : INS_xchg; - - // XCHG has an implied lock prefix when the first operand is a memory operand. - if (ins != INS_xchg) - { - instGen(INS_lock); - } - - GetEmitter()->emitIns_AR_R(ins, size, node->GetRegNum(), addr->GetRegNum(), 0); - genProduceReg(node); -} - -//------------------------------------------------------------------------ -// genCodeForCmpXchg: Produce code for a GT_CMPXCHG node. -// -// Arguments: -// tree - the GT_CMPXCHG node -// -void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* tree) -{ - assert(tree->OperIs(GT_CMPXCHG)); - - var_types targetType = tree->TypeGet(); - regNumber targetReg = tree->GetRegNum(); - - GenTree* location = tree->gtOpLocation; // arg1 - GenTree* value = tree->gtOpValue; // arg2 - GenTree* comparand = tree->gtOpComparand; // arg3 - - assert(location->GetRegNum() != REG_NA && location->GetRegNum() != REG_RAX); - assert(value->GetRegNum() != REG_NA && value->GetRegNum() != REG_RAX); - - genConsumeReg(location); - genConsumeReg(value); - genConsumeReg(comparand); - - // comparand goes to RAX; - // Note that we must issue this move after the genConsumeRegs(), in case any of the above - // have a GT_COPY from RAX. - if (comparand->GetRegNum() != REG_RAX) - { - inst_RV_RV(ins_Copy(comparand->TypeGet()), REG_RAX, comparand->GetRegNum(), comparand->TypeGet()); - } - - // location is Rm - instGen(INS_lock); - - GetEmitter()->emitIns_AR_R(INS_cmpxchg, emitTypeSize(targetType), value->GetRegNum(), location->GetRegNum(), 0); - - // Result is in RAX - if (targetReg != REG_RAX) - { - inst_RV_RV(ins_Copy(targetType), targetReg, REG_RAX, targetType); - } - - genProduceReg(tree); -} - -// generate code for BoundsCheck nodes -void CodeGen::genRangeCheck(GenTree* oper) -{ - noway_assert(oper->OperIsBoundsCheck()); - GenTreeBoundsChk* bndsChk = oper->AsBoundsChk(); - - GenTree* arrIndex = bndsChk->gtIndex; - GenTree* arrLen = bndsChk->gtArrLen; - - GenTree * src1, *src2; - emitJumpKind jmpKind; - instruction cmpKind; - - genConsumeRegs(arrIndex); - genConsumeRegs(arrLen); - - if (arrIndex->IsIntegralConst(0) && arrLen->isUsedFromReg()) - { - // arrIndex is 0 and arrLen is in a reg. In this case - // we can generate - // test reg, reg - // since arrLen is non-negative - src1 = arrLen; - src2 = arrLen; - jmpKind = EJ_je; - cmpKind = INS_test; - } - else if (arrIndex->isContainedIntOrIImmed()) - { - // arrIndex is a contained constant. In this case - // we will generate one of the following - // cmp [mem], immed (if arrLen is a memory op) - // cmp reg, immed (if arrLen is in a reg) - // - // That is arrLen cannot be a contained immed. - assert(!arrLen->isContainedIntOrIImmed()); - - src1 = arrLen; - src2 = arrIndex; - jmpKind = EJ_jbe; - cmpKind = INS_cmp; - } - else - { - // arrIndex could either be a contained memory op or a reg - // In this case we will generate one of the following - // cmp [mem], immed (if arrLen is a constant) - // cmp [mem], reg (if arrLen is in a reg) - // cmp reg, immed (if arrIndex is in a reg) - // cmp reg1, reg2 (if arrIndex is in reg1) - // cmp reg, [mem] (if arrLen is a memory op) - // - // That is only one of arrIndex or arrLen can be a memory op. 
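The case analysis just above relies on the usual unsigned-compare trick for bounds checks; a minimal scalar sketch of the test that the jae/jbe forms implement (standalone C++, not JIT code):

#include <cstdint>

// A single unsigned comparison covers both "index < 0" and "index >= length",
// because a negative index reinterpreted as unsigned becomes a very large value.
inline bool IndexInRange(int32_t index, int32_t length)
{
    return static_cast<uint32_t>(index) < static_cast<uint32_t>(length); // cmp ; jae -> throw block otherwise
}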
- assert(!arrIndex->isUsedFromMemory() || !arrLen->isUsedFromMemory()); - - src1 = arrIndex; - src2 = arrLen; - jmpKind = EJ_jae; - cmpKind = INS_cmp; - } - - var_types bndsChkType = src2->TypeGet(); -#if DEBUG - // Bounds checks can only be 32 or 64 bit sized comparisons. - assert(bndsChkType == TYP_INT || bndsChkType == TYP_LONG); - - // The type of the bounds check should always wide enough to compare against the index. - assert(emitTypeSize(bndsChkType) >= emitTypeSize(src1->TypeGet())); -#endif // DEBUG - - GetEmitter()->emitInsBinary(cmpKind, emitTypeSize(bndsChkType), src1, src2); - genJumpToThrowHlpBlk(jmpKind, bndsChk->gtThrowKind, bndsChk->gtIndRngFailBB); -} - -//--------------------------------------------------------------------- -// genCodeForPhysReg - generate code for a GT_PHYSREG node -// -// Arguments -// tree - the GT_PHYSREG node -// -// Return value: -// None -// -void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree) -{ - assert(tree->OperIs(GT_PHYSREG)); - - var_types targetType = tree->TypeGet(); - regNumber targetReg = tree->GetRegNum(); - - if (targetReg != tree->gtSrcReg) - { - inst_RV_RV(ins_Copy(targetType), targetReg, tree->gtSrcReg, targetType); - genTransferRegGCState(targetReg, tree->gtSrcReg); - } - - genProduceReg(tree); -} - -//--------------------------------------------------------------------- -// genCodeForNullCheck - generate code for a GT_NULLCHECK node -// -// Arguments -// tree - the GT_NULLCHECK node -// -// Return value: -// None -// -void CodeGen::genCodeForNullCheck(GenTreeIndir* tree) -{ - assert(tree->OperIs(GT_NULLCHECK)); - - assert(tree->gtOp1->isUsedFromReg()); - regNumber reg = genConsumeReg(tree->gtOp1); - GetEmitter()->emitIns_AR_R(INS_cmp, EA_4BYTE, reg, reg, 0); -} - -//------------------------------------------------------------------------ -// genOffsetOfMDArrayLowerBound: Returns the offset from the Array object to the -// lower bound for the given dimension. -// -// Arguments: -// elemType - the element type of the array -// rank - the rank of the array -// dimension - the dimension for which the lower bound offset will be returned. -// -// Return Value: -// The offset. - -unsigned CodeGen::genOffsetOfMDArrayLowerBound(var_types elemType, unsigned rank, unsigned dimension) -{ - // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. - return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * (dimension + rank); -} - -//------------------------------------------------------------------------ -// genOffsetOfMDArrayLength: Returns the offset from the Array object to the -// size for the given dimension. -// -// Arguments: -// elemType - the element type of the array -// rank - the rank of the array -// dimension - the dimension for which the lower bound offset will be returned. -// -// Return Value: -// The offset. - -unsigned CodeGen::genOffsetOfMDArrayDimensionSize(var_types elemType, unsigned rank, unsigned dimension) -{ - // Note that the lower bound and length fields of the Array object are always TYP_INT, even on 64-bit targets. - return compiler->eeGetArrayDataOffset(elemType) + genTypeSize(TYP_INT) * dimension; -} - -//------------------------------------------------------------------------ -// genCodeForArrIndex: Generates code to bounds check the index for one dimension of an array reference, -// producing the effective index by subtracting the lower bound. 
-// -// Arguments: -// arrIndex - the node for which we're generating code -// -// Return Value: -// None. -// - -void CodeGen::genCodeForArrIndex(GenTreeArrIndex* arrIndex) -{ - GenTree* arrObj = arrIndex->ArrObj(); - GenTree* indexNode = arrIndex->IndexExpr(); - - regNumber arrReg = genConsumeReg(arrObj); - regNumber indexReg = genConsumeReg(indexNode); - regNumber tgtReg = arrIndex->GetRegNum(); - - unsigned dim = arrIndex->gtCurrDim; - unsigned rank = arrIndex->gtArrRank; - var_types elemType = arrIndex->gtArrElemType; - - noway_assert(tgtReg != REG_NA); - - // Subtract the lower bound for this dimension. - // TODO-XArch-CQ: make this contained if it's an immediate that fits. - if (tgtReg != indexReg) - { - inst_RV_RV(INS_mov, tgtReg, indexReg, indexNode->TypeGet()); - } - GetEmitter()->emitIns_R_AR(INS_sub, emitActualTypeSize(TYP_INT), tgtReg, arrReg, - genOffsetOfMDArrayLowerBound(elemType, rank, dim)); - GetEmitter()->emitIns_R_AR(INS_cmp, emitActualTypeSize(TYP_INT), tgtReg, arrReg, - genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); - genJumpToThrowHlpBlk(EJ_jae, SCK_RNGCHK_FAIL); - - genProduceReg(arrIndex); -} - -//------------------------------------------------------------------------ -// genCodeForArrOffset: Generates code to compute the flattened array offset for -// one dimension of an array reference: -// result = (prevDimOffset * dimSize) + effectiveIndex -// where dimSize is obtained from the arrObj operand -// -// Arguments: -// arrOffset - the node for which we're generating code -// -// Return Value: -// None. -// -// Notes: -// dimSize and effectiveIndex are always non-negative, the former by design, -// and the latter because it has been normalized to be zero-based. - -void CodeGen::genCodeForArrOffset(GenTreeArrOffs* arrOffset) -{ - GenTree* offsetNode = arrOffset->gtOffset; - GenTree* indexNode = arrOffset->gtIndex; - GenTree* arrObj = arrOffset->gtArrObj; - - regNumber tgtReg = arrOffset->GetRegNum(); - assert(tgtReg != REG_NA); - - unsigned dim = arrOffset->gtCurrDim; - unsigned rank = arrOffset->gtArrRank; - var_types elemType = arrOffset->gtArrElemType; - - // First, consume the operands in the correct order. - regNumber offsetReg = REG_NA; - regNumber tmpReg = REG_NA; - if (!offsetNode->IsIntegralConst(0)) - { - offsetReg = genConsumeReg(offsetNode); - - // We will use a temp register for the offset*scale+effectiveIndex computation. - tmpReg = arrOffset->GetSingleTempReg(); - } - else - { - assert(offsetNode->isContained()); - } - regNumber indexReg = genConsumeReg(indexNode); - // Although arrReg may not be used in the constant-index case, if we have generated - // the value into a register, we must consume it, otherwise we will fail to end the - // live range of the gc ptr. - // TODO-CQ: Currently arrObj will always have a register allocated to it. - // We could avoid allocating a register for it, which would be of value if the arrObj - // is an on-stack lclVar. - regNumber arrReg = REG_NA; - if (arrObj->gtHasReg()) - { - arrReg = genConsumeReg(arrObj); - } - - if (!offsetNode->IsIntegralConst(0)) - { - assert(tmpReg != REG_NA); - assert(arrReg != REG_NA); - - // Evaluate tgtReg = offsetReg*dim_size + indexReg. - // tmpReg is used to load dim_size and the result of the multiplication. - // Note that dim_size will never be negative. 
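A minimal scalar model of the per-dimension computation that genCodeForArrIndex and genCodeForArrOffset emit, assuming zero-based effective indices after the lower-bound subtraction (standalone sketch; the helper name is hypothetical):

#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Per dimension: effectiveIndex = index - lowerBound (range checked against the dimension size),
// then offset = offset * dimSize + effectiveIndex.
inline size_t FlattenMDArrayIndex(const int32_t* index, const int32_t* lowerBound,
                                  const int32_t* dimSize, unsigned rank)
{
    size_t offset = 0;
    for (unsigned d = 0; d < rank; d++)
    {
        uint32_t effective = static_cast<uint32_t>(index[d] - lowerBound[d]);
        if (effective >= static_cast<uint32_t>(dimSize[d]))
            throw std::out_of_range("index out of range"); // SCK_RNGCHK_FAIL in the JIT
        offset = offset * static_cast<uint32_t>(dimSize[d]) + effective;
    }
    return offset;
}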
- - GetEmitter()->emitIns_R_AR(INS_mov, emitActualTypeSize(TYP_INT), tmpReg, arrReg, - genOffsetOfMDArrayDimensionSize(elemType, rank, dim)); - inst_RV_RV(INS_imul, tmpReg, offsetReg); - - if (tmpReg == tgtReg) - { - inst_RV_RV(INS_add, tmpReg, indexReg); - } - else - { - if (indexReg != tgtReg) - { - inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_I_IMPL); - } - inst_RV_RV(INS_add, tgtReg, tmpReg); - } - } - else - { - if (indexReg != tgtReg) - { - inst_RV_RV(INS_mov, tgtReg, indexReg, TYP_INT); - } - } - genProduceReg(arrOffset); -} - -instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type) -{ - instruction ins; - - // Operations on SIMD vectors shouldn't come this path - assert(!varTypeIsSIMD(type)); - if (varTypeIsFloating(type)) - { - return ins_MathOp(oper, type); - } - - switch (oper) - { - case GT_ADD: - ins = INS_add; - break; - case GT_AND: - ins = INS_and; - break; - case GT_LSH: - ins = INS_shl; - break; - case GT_MUL: - ins = INS_imul; - break; - case GT_NEG: - ins = INS_neg; - break; - case GT_NOT: - ins = INS_not; - break; - case GT_OR: - ins = INS_or; - break; - case GT_ROL: - ins = INS_rol; - break; - case GT_ROR: - ins = INS_ror; - break; - case GT_RSH: - ins = INS_sar; - break; - case GT_RSZ: - ins = INS_shr; - break; - case GT_SUB: - ins = INS_sub; - break; - case GT_XOR: - ins = INS_xor; - break; -#if !defined(TARGET_64BIT) - case GT_ADD_LO: - ins = INS_add; - break; - case GT_ADD_HI: - ins = INS_adc; - break; - case GT_SUB_LO: - ins = INS_sub; - break; - case GT_SUB_HI: - ins = INS_sbb; - break; - case GT_LSH_HI: - ins = INS_shld; - break; - case GT_RSH_LO: - ins = INS_shrd; - break; -#endif // !defined(TARGET_64BIT) - default: - unreached(); - break; - } - return ins; -} - -//------------------------------------------------------------------------ -// genCodeForShift: Generates the code sequence for a GenTree node that -// represents a bit shift or rotate operation (<<, >>, >>>, rol, ror). -// -// Arguments: -// tree - the bit shift node (that specifies the type of bit shift to perform). -// -// Assumptions: -// a) All GenTrees are register allocated. -// b) The shift-by-amount in tree->AsOp()->gtOp2 is either a contained constant or -// it's a register-allocated expression. If it is in a register that is -// not RCX, it will be moved to RCX (so RCX better not be in use!). -// -void CodeGen::genCodeForShift(GenTree* tree) -{ - // Only the non-RMW case here. 
- assert(tree->OperIsShiftOrRotate()); - assert(tree->AsOp()->gtOp1->isUsedFromReg()); - assert(tree->GetRegNum() != REG_NA); - - genConsumeOperands(tree->AsOp()); - - var_types targetType = tree->TypeGet(); - instruction ins = genGetInsForOper(tree->OperGet(), targetType); - - GenTree* operand = tree->gtGetOp1(); - regNumber operandReg = operand->GetRegNum(); - - GenTree* shiftBy = tree->gtGetOp2(); - - if (shiftBy->isContainedIntOrIImmed()) - { - emitAttr size = emitTypeSize(tree); - - // Optimize "X<<1" to "lea [reg+reg]" or "add reg, reg" - if (tree->OperIs(GT_LSH) && !tree->gtOverflowEx() && !tree->gtSetFlags() && shiftBy->IsIntegralConst(1)) - { - if (tree->GetRegNum() == operandReg) - { - GetEmitter()->emitIns_R_R(INS_add, size, tree->GetRegNum(), operandReg); - } - else - { - GetEmitter()->emitIns_R_ARX(INS_lea, size, tree->GetRegNum(), operandReg, operandReg, 1, 0); - } - } - else - { - int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue(); - -#if defined(TARGET_64BIT) - // Try to emit rorx if BMI2 is available instead of mov+rol - // it makes sense only for 64bit integers - if ((genActualType(targetType) == TYP_LONG) && (tree->GetRegNum() != operandReg) && - compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) && tree->OperIs(GT_ROL, GT_ROR) && - (shiftByValue > 0) && (shiftByValue < 64)) - { - const int value = tree->OperIs(GT_ROL) ? (64 - shiftByValue) : shiftByValue; - GetEmitter()->emitIns_R_R_I(INS_rorx, size, tree->GetRegNum(), operandReg, value); - genProduceReg(tree); - return; - } -#endif - // First, move the operand to the destination register and - // later on perform the shift in-place. - // (LSRA will try to avoid this situation through preferencing.) - if (tree->GetRegNum() != operandReg) - { - inst_RV_RV(INS_mov, tree->GetRegNum(), operandReg, targetType); - } - inst_RV_SH(ins, size, tree->GetRegNum(), shiftByValue); - } - } - else - { - // We must have the number of bits to shift stored in ECX, since we constrained this node to - // sit in ECX. In case this didn't happen, LSRA expects the code generator to move it since it's a single - // register destination requirement. - genCopyRegIfNeeded(shiftBy, REG_RCX); - - // The operand to be shifted must not be in ECX - noway_assert(operandReg != REG_RCX); - - if (tree->GetRegNum() != operandReg) - { - inst_RV_RV(INS_mov, tree->GetRegNum(), operandReg, targetType); - } - inst_RV_CL(ins, tree->GetRegNum(), targetType); - } - - genProduceReg(tree); -} - -#ifdef TARGET_X86 -//------------------------------------------------------------------------ -// genCodeForShiftLong: Generates the code sequence for a GenTree node that -// represents a three operand bit shift or rotate operation (<>Lo). -// -// Arguments: -// tree - the bit shift node (that specifies the type of bit shift to perform). -// -// Assumptions: -// a) All GenTrees are register allocated. -// b) The shift-by-amount in tree->AsOp()->gtOp2 is a contained constant -// -// TODO-X86-CQ: This only handles the case where the operand being shifted is in a register. We don't -// need sourceHi to be always in reg in case of GT_LSH_HI (because it could be moved from memory to -// targetReg if sourceHi is a memory operand). Similarly for GT_RSH_LO, sourceLo could be marked as -// contained memory-op. Even if not a memory-op, we could mark it as reg-optional. -// -void CodeGen::genCodeForShiftLong(GenTree* tree) -{ - // Only the non-RMW case here. 
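Two of the peepholes in genCodeForShift above, restated as plain C++ identities (a sketch of the reasoning, not emitter code): a left shift by one equals adding the value to itself, which is why add reg,reg or lea [reg+reg] can replace the shift, and a left rotate by n equals a right rotate by 64-n, which is why a single BMI2 rorx can also implement GT_ROL.

#include <cstdint>

inline uint64_t RotL64(uint64_t x, unsigned n)
{
    n &= 63;
    return (x << n) | (x >> ((64 - n) & 63));
}

inline uint64_t RotR64(uint64_t x, unsigned n)
{
    n &= 63;
    return (x >> n) | (x << ((64 - n) & 63));
}

// For 0 < n < 64: RotL64(x, n) == RotR64(x, 64 - n), matching the "rorx dst, src, 64 - n" rewrite.
// And for unsigned x: (x << 1) == x + x, matching the add/lea form used for shift-by-one.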
- genTreeOps oper = tree->OperGet(); - assert(oper == GT_LSH_HI || oper == GT_RSH_LO); - - GenTree* operand = tree->AsOp()->gtOp1; - assert(operand->OperGet() == GT_LONG); - assert(operand->AsOp()->gtOp1->isUsedFromReg()); - assert(operand->AsOp()->gtOp2->isUsedFromReg()); - - GenTree* operandLo = operand->gtGetOp1(); - GenTree* operandHi = operand->gtGetOp2(); - - regNumber regLo = operandLo->GetRegNum(); - regNumber regHi = operandHi->GetRegNum(); - - genConsumeOperands(tree->AsOp()); - - var_types targetType = tree->TypeGet(); - instruction ins = genGetInsForOper(oper, targetType); - - GenTree* shiftBy = tree->gtGetOp2(); - - assert(shiftBy->isContainedIntOrIImmed()); - - unsigned int count = (unsigned int)shiftBy->AsIntConCommon()->IconValue(); - - regNumber regResult = (oper == GT_LSH_HI) ? regHi : regLo; - - if (regResult != tree->GetRegNum()) - { - inst_RV_RV(INS_mov, tree->GetRegNum(), regResult, targetType); - } - - if (oper == GT_LSH_HI) - { - inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->GetRegNum(), regLo, count); - } - else - { - assert(oper == GT_RSH_LO); - inst_RV_RV_IV(ins, emitTypeSize(targetType), tree->GetRegNum(), regHi, count); - } - - genProduceReg(tree); -} -#endif - -//------------------------------------------------------------------------ -// genCodeForShiftRMW: Generates the code sequence for a GT_STOREIND GenTree node that -// represents a RMW bit shift or rotate operation (<<, >>, >>>, rol, ror), for example: -// GT_STOREIND( AddressTree, GT_SHL( Ind ( AddressTree ), Operand ) ) -// -// Arguments: -// storeIndNode: the GT_STOREIND node. -// -//void CodeGen::genCodeForShiftRMW(GenTreeStoreInd* storeInd) -//{ -// assert(false); -//} - -//------------------------------------------------------------------------ -// genCodeForLclFld: Produce code for a GT_LCL_FLD node. -// -// Arguments: -// tree - the GT_LCL_FLD node -// -void CodeGen::genCodeForLclFld(GenTreeLclFld* tree) -{ - assert(tree->OperIs(GT_LCL_FLD)); - - var_types targetType = tree->TypeGet(); - regNumber targetReg = tree->GetRegNum(); - - noway_assert(targetReg != REG_NA); - -#ifdef FEATURE_SIMD - // Loading of TYP_SIMD12 (i.e. Vector3) field - if (targetType == TYP_SIMD12) - { - genLoadLclTypeSIMD12(tree); - return; - } -#endif - - noway_assert(targetType != TYP_STRUCT); - - emitAttr size = emitTypeSize(targetType); - unsigned offs = tree->GetLclOffs(); - unsigned varNum = tree->GetLclNum(); - assert(varNum < compiler->lvaCount); - - GetEmitter()->emitIns_R_S(ins_Load(targetType), size, targetReg, varNum, offs); - - genProduceReg(tree); -} - -//------------------------------------------------------------------------ -// genCodeForLclVar: Produce code for a GT_LCL_VAR node. -// -// Arguments: -// tree - the GT_LCL_VAR node -// -void CodeGen::genCodeForLclVar(GenTreeLclVar* tree) -{ - assert(tree->OperIs(GT_LCL_VAR)); - - // lcl_vars are not defs - assert((tree->gtFlags & GTF_VAR_DEF) == 0); - - LclVarDsc* varDsc = compiler->lvaGetDesc(tree); - bool isRegCandidate = varDsc->lvIsRegCandidate(); - - // If this is a register candidate that has been spilled, genConsumeReg() will - // reload it at the point of use. Otherwise, if it's not in a register, we load it here. - - if (!isRegCandidate && !tree->IsMultiReg() && !(tree->gtFlags & GTF_SPILLED)) - { -#if defined(FEATURE_SIMD) && defined(TARGET_X86) - // Loading of TYP_SIMD12 (i.e. 
Vector3) variable - if (tree->TypeGet() == TYP_SIMD12) - { - genLoadLclTypeSIMD12(tree); - return; - } -#endif // defined(FEATURE_SIMD) && defined(TARGET_X86) - - var_types type = varDsc->GetRegisterType(tree); - GetEmitter()->emitIns_R_S(ins_Load(type, compiler->isSIMDTypeLocalAligned(tree->GetLclNum())), - emitTypeSize(type), tree->GetRegNum(), tree->GetLclNum(), 0); - genProduceReg(tree); - } -} - -//------------------------------------------------------------------------ -// genCodeForStoreLclFld: Produce code for a GT_STORE_LCL_FLD node. -// -// Arguments: -// tree - the GT_STORE_LCL_FLD node -// -void CodeGen::genCodeForStoreLclFld(GenTreeLclFld* tree) -{ - assert(tree->OperIs(GT_STORE_LCL_FLD)); - - var_types targetType = tree->TypeGet(); - GenTree* op1 = tree->gtGetOp1(); - - noway_assert(targetType != TYP_STRUCT); - -#ifdef FEATURE_SIMD - // storing of TYP_SIMD12 (i.e. Vector3) field - if (tree->TypeGet() == TYP_SIMD12) - { - genStoreLclTypeSIMD12(tree); - return; - } -#endif // FEATURE_SIMD - - assert(varTypeUsesFloatReg(targetType) == varTypeUsesFloatReg(op1)); - assert(genTypeSize(genActualType(targetType)) == genTypeSize(genActualType(op1->TypeGet()))); - - genConsumeRegs(op1); - GetEmitter()->emitInsBinary(ins_Store(targetType), emitTypeSize(tree), tree, op1); - - // Updating variable liveness after instruction was emitted - genUpdateLife(tree); -} - -//------------------------------------------------------------------------ -// genCodeForStoreLclVar: Produce code for a GT_STORE_LCL_VAR node. -// -// Arguments: -// lclNode - the GT_STORE_LCL_VAR node -// -void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode) -{ - assert(lclNode->OperIs(GT_STORE_LCL_VAR)); - - regNumber targetReg = lclNode->GetRegNum(); - emitter* emit = GetEmitter(); - - GenTree* op1 = lclNode->gtGetOp1(); - - // Stores from a multi-reg source are handled separately. - if (op1->gtSkipReloadOrCopy()->IsMultiRegNode()) - { - genMultiRegStoreToLocal(lclNode); - } - else - { - unsigned lclNum = lclNode->GetLclNum(); - LclVarDsc* varDsc = compiler->lvaGetDesc(lclNum); - - var_types targetType = varDsc->GetRegisterType(lclNode); - -#ifdef DEBUG - var_types op1Type = op1->TypeGet(); - if (op1Type == TYP_STRUCT) - { - assert(op1->IsLocal()); - GenTreeLclVar* op1LclVar = op1->AsLclVar(); - unsigned op1lclNum = op1LclVar->GetLclNum(); - LclVarDsc* op1VarDsc = compiler->lvaGetDesc(op1lclNum); - op1Type = op1VarDsc->GetRegisterType(op1LclVar); - } - assert(varTypeUsesFloatReg(targetType) == varTypeUsesFloatReg(op1Type)); - assert(!varTypeUsesFloatReg(targetType) || (emitTypeSize(targetType) == emitTypeSize(op1Type))); -#endif - -#if !defined(TARGET_64BIT) - if (targetType == TYP_LONG) - { - genStoreLongLclVar(lclNode); - return; - } -#endif // !defined(TARGET_64BIT) - -#ifdef FEATURE_SIMD - // storing of TYP_SIMD12 (i.e. 
Vector3) field - if (targetType == TYP_SIMD12) - { - genStoreLclTypeSIMD12(lclNode); - return; - } -#endif // FEATURE_SIMD - - genConsumeRegs(op1); - - if (op1->OperIs(GT_BITCAST) && op1->isContained()) - { - GenTree* bitCastSrc = op1->gtGetOp1(); - var_types srcType = bitCastSrc->TypeGet(); - noway_assert(!bitCastSrc->isContained()); - if (targetReg == REG_NA) - { - emit->emitIns_S_R(ins_Store(srcType, compiler->isSIMDTypeLocalAligned(lclNum)), - emitTypeSize(targetType), bitCastSrc->GetRegNum(), lclNum, 0); - genUpdateLife(lclNode); - varDsc->SetRegNum(REG_STK); - } - else - { - genBitCast(targetType, targetReg, srcType, bitCastSrc->GetRegNum()); - } - } - else if (targetReg == REG_NA) - { - // stack store - emit->emitInsStoreLcl(ins_Store(targetType, compiler->isSIMDTypeLocalAligned(lclNum)), - emitTypeSize(targetType), lclNode); - varDsc->SetRegNum(REG_STK); - } - else - { - // Look for the case where we have a constant zero which we've marked for reuse, - // but which isn't actually in the register we want. In that case, it's better to create - // zero in the target register, because an xor is smaller than a copy. Note that we could - // potentially handle this in the register allocator, but we can't always catch it there - // because the target may not have a register allocated for it yet. - if (op1->isUsedFromReg() && (op1->GetRegNum() != targetReg) && (op1->IsIntegralConst(0) || op1->IsFPZero())) - { - op1->SetRegNum(REG_NA); - op1->ResetReuseRegVal(); - op1->SetContained(); - } - - if (!op1->isUsedFromReg()) - { - // Currently, we assume that the non-reg source of a GT_STORE_LCL_VAR writing to a register - // must be a constant. However, in the future we might want to support an operand used from - // memory. This is a bit tricky because we have to decide it can be used from memory before - // register allocation, - // and this would be a case where, once that's done, we need to mark that node as always - // requiring a register - which we always assume now anyway, but once we "optimize" that - // we'll have to take cases like this into account. - assert((op1->GetRegNum() == REG_NA) && op1->OperIsConst()); - genSetRegToConst(targetReg, targetType, op1); - } - else if (op1->GetRegNum() != targetReg) - { - assert(op1->GetRegNum() != REG_NA); - emit->emitInsBinary(ins_Move_Extend(targetType, true), emitTypeSize(lclNode), lclNode, op1); - } - } - if (targetReg != REG_NA) - { - genProduceReg(lclNode); - } - } -} - -//------------------------------------------------------------------------ -// genCodeForIndexAddr: Produce code for a GT_INDEX_ADDR node. -// -// Arguments: -// tree - the GT_INDEX_ADDR node -// -void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// genCodeForIndir: Produce code for a GT_IND node. -// -// Arguments: -// tree - the GT_IND node -// -void CodeGen::genCodeForIndir(GenTreeIndir* tree) -{ - assert(tree->OperIs(GT_IND)); - -#ifdef FEATURE_SIMD - // Handling of Vector3 type values loaded through indirection. 
- if (tree->TypeGet() == TYP_SIMD12) - { - genLoadIndTypeSIMD12(tree); - return; - } -#endif // FEATURE_SIMD - - var_types targetType = tree->TypeGet(); - emitter* emit = GetEmitter(); - - GenTree* addr = tree->Addr(); - if (addr->IsCnsIntOrI() && addr->IsIconHandle(GTF_ICON_TLS_HDL)) - { - noway_assert(EA_ATTR(genTypeSize(targetType)) == EA_PTRSIZE); - emit->emitIns_R_C(ins_Load(TYP_I_IMPL), EA_PTRSIZE, tree->GetRegNum(), FLD_GLOBAL_FS, - (int)addr->AsIntCon()->gtIconVal); - } - else - { - genConsumeAddress(addr); - emit->emitInsLoadInd(ins_Load(targetType), emitTypeSize(tree), tree->GetRegNum(), tree); - } - - genProduceReg(tree); -} - -//------------------------------------------------------------------------ -// genCodeForStoreInd: Produce code for a GT_STOREIND node. -// -// Arguments: -// tree - the GT_STOREIND node -// -void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// genCodeForSwap: Produce code for a GT_SWAP node. -// -// Arguments: -// tree - the GT_SWAP node -// -void CodeGen::genCodeForSwap(GenTreeOp* tree) -{ - assert(tree->OperIs(GT_SWAP)); - - // Swap is only supported for lclVar operands that are enregistered - // We do not consume or produce any registers. Both operands remain enregistered. - // However, the gc-ness may change. - assert(genIsRegCandidateLocal(tree->gtOp1) && genIsRegCandidateLocal(tree->gtOp2)); - - GenTreeLclVarCommon* lcl1 = tree->gtOp1->AsLclVarCommon(); - LclVarDsc* varDsc1 = &(compiler->lvaTable[lcl1->GetLclNum()]); - var_types type1 = varDsc1->TypeGet(); - GenTreeLclVarCommon* lcl2 = tree->gtOp2->AsLclVarCommon(); - LclVarDsc* varDsc2 = &(compiler->lvaTable[lcl2->GetLclNum()]); - var_types type2 = varDsc2->TypeGet(); - - // We must have both int or both fp regs - assert(!varTypeUsesFloatReg(type1) || varTypeUsesFloatReg(type2)); - - // FP swap is not yet implemented (and should have NYI'd in LSRA) - assert(!varTypeUsesFloatReg(type1)); - - regNumber oldOp1Reg = lcl1->GetRegNum(); - regMaskTP oldOp1RegMask = genRegMask(oldOp1Reg); - regNumber oldOp2Reg = lcl2->GetRegNum(); - regMaskTP oldOp2RegMask = genRegMask(oldOp2Reg); - - // We don't call genUpdateVarReg because we don't have a tree node with the new register. - varDsc1->SetRegNum(oldOp2Reg); - varDsc2->SetRegNum(oldOp1Reg); - - // Do the xchg - emitAttr size = EA_PTRSIZE; - if (varTypeGCtype(type1) != varTypeGCtype(type2)) - { - // If the type specified to the emitter is a GC type, it will swap the GC-ness of the registers. - // Otherwise it will leave them alone, which is correct if they have the same GC-ness. - size = EA_GCREF; - } - inst_RV_RV(INS_xchg, oldOp1Reg, oldOp2Reg, TYP_I_IMPL, size); - - // Update the gcInfo. - // Manually remove these regs for the gc sets (mostly to avoid confusing duplicative dump output) - gcInfo.gcRegByrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask); - gcInfo.gcRegGCrefSetCur &= ~(oldOp1RegMask | oldOp2RegMask); - - // gcMarkRegPtrVal will do the appropriate thing for non-gc types. - // It will also dump the updates. - gcInfo.gcMarkRegPtrVal(oldOp2Reg, type1); - gcInfo.gcMarkRegPtrVal(oldOp1Reg, type2); -} - -//------------------------------------------------------------------------ -// genEmitOptimizedGCWriteBarrier: Generate write barrier store using the optimized -// helper functions. 
-// -// Arguments: -// writeBarrierForm - the write barrier form to use -// addr - the address at which to do the store -// data - the data to store -// -// Return Value: -// true if an optimized write barrier form was used, false if not. If this -// function returns false, the caller must emit a "standard" write barrier. - -bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarrierForm, GenTree* addr, GenTree* data) -{ - assert(writeBarrierForm != GCInfo::WBF_NoBarrier); - -#if defined(TARGET_X86) && NOGC_WRITE_BARRIERS - if (!genUseOptimizedWriteBarriers(writeBarrierForm)) - { - return false; - } - - const static int regToHelper[2][8] = { - // If the target is known to be in managed memory - { - CORINFO_HELP_ASSIGN_REF_EAX, // EAX - CORINFO_HELP_ASSIGN_REF_ECX, // ECX - -1, // EDX (always the target address) - CORINFO_HELP_ASSIGN_REF_EBX, // EBX - -1, // ESP - CORINFO_HELP_ASSIGN_REF_EBP, // EBP - CORINFO_HELP_ASSIGN_REF_ESI, // ESI - CORINFO_HELP_ASSIGN_REF_EDI, // EDI - }, - - // Don't know if the target is in managed memory - { - CORINFO_HELP_CHECKED_ASSIGN_REF_EAX, // EAX - CORINFO_HELP_CHECKED_ASSIGN_REF_ECX, // ECX - -1, // EDX (always the target address) - CORINFO_HELP_CHECKED_ASSIGN_REF_EBX, // EBX - -1, // ESP - CORINFO_HELP_CHECKED_ASSIGN_REF_EBP, // EBP - CORINFO_HELP_CHECKED_ASSIGN_REF_ESI, // ESI - CORINFO_HELP_CHECKED_ASSIGN_REF_EDI, // EDI - }, - }; - - noway_assert(regToHelper[0][REG_EAX] == CORINFO_HELP_ASSIGN_REF_EAX); - noway_assert(regToHelper[0][REG_ECX] == CORINFO_HELP_ASSIGN_REF_ECX); - noway_assert(regToHelper[0][REG_EBX] == CORINFO_HELP_ASSIGN_REF_EBX); - noway_assert(regToHelper[0][REG_ESP] == -1); - noway_assert(regToHelper[0][REG_EBP] == CORINFO_HELP_ASSIGN_REF_EBP); - noway_assert(regToHelper[0][REG_ESI] == CORINFO_HELP_ASSIGN_REF_ESI); - noway_assert(regToHelper[0][REG_EDI] == CORINFO_HELP_ASSIGN_REF_EDI); - - noway_assert(regToHelper[1][REG_EAX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EAX); - noway_assert(regToHelper[1][REG_ECX] == CORINFO_HELP_CHECKED_ASSIGN_REF_ECX); - noway_assert(regToHelper[1][REG_EBX] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBX); - noway_assert(regToHelper[1][REG_ESP] == -1); - noway_assert(regToHelper[1][REG_EBP] == CORINFO_HELP_CHECKED_ASSIGN_REF_EBP); - noway_assert(regToHelper[1][REG_ESI] == CORINFO_HELP_CHECKED_ASSIGN_REF_ESI); - noway_assert(regToHelper[1][REG_EDI] == CORINFO_HELP_CHECKED_ASSIGN_REF_EDI); - - regNumber reg = data->GetRegNum(); - noway_assert((reg != REG_ESP) && (reg != REG_WRITE_BARRIER)); - - // Generate the following code: - // lea edx, addr - // call write_barrier_helper_reg - - // addr goes in REG_ARG_0 - genCopyRegIfNeeded(addr, REG_WRITE_BARRIER); - - unsigned tgtAnywhere = 0; - if (writeBarrierForm != GCInfo::WBF_BarrierUnchecked) - { - tgtAnywhere = 1; - } - - // We might want to call a modified version of genGCWriteBarrier() to get the benefit of - // the FEATURE_COUNT_GC_WRITE_BARRIERS code there, but that code doesn't look like it works - // with rationalized RyuJIT IR. So, for now, just emit the helper call directly here. - - genEmitHelperCall(regToHelper[tgtAnywhere][reg], - 0, // argSize - EA_PTRSIZE); // retSize - - return true; -#else // !defined(TARGET_X86) || !NOGC_WRITE_BARRIERS - return false; -#endif // !defined(TARGET_X86) || !NOGC_WRITE_BARRIERS -} - -// Produce code for a GT_CALL node -void CodeGen::genCallInstruction(GenTreeCall* call) -{ - assert(false); - -} - -// Produce code for a GT_JMP node. 
-// The arguments of the caller needs to be transferred to the callee before exiting caller. -// The actual jump to callee is generated as part of caller epilog sequence. -// Therefore the codegen of GT_JMP is to ensure that the callee arguments are correctly setup. -void CodeGen::genJmpMethod(GenTree* jmp) -{ - assert(jmp->OperGet() == GT_JMP); - assert(compiler->compJmpOpUsed); - - // If no arguments, nothing to do - if (compiler->info.compArgsCount == 0) - { - return; - } - - // Make sure register arguments are in their initial registers - // and stack arguments are put back as well. - unsigned varNum; - LclVarDsc* varDsc; - - // First move any en-registered stack arguments back to the stack. - // At the same time any reg arg not in correct reg is moved back to its stack location. - // - // We are not strictly required to spill reg args that are not in the desired reg for a jmp call - // But that would require us to deal with circularity while moving values around. Spilling - // to stack makes the implementation simple, which is not a bad trade off given Jmp calls - // are not frequent. - for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) - { - varDsc = compiler->lvaTable + varNum; - - if (varDsc->lvPromoted) - { - noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here - - unsigned fieldVarNum = varDsc->lvFieldLclStart; - varDsc = compiler->lvaTable + fieldVarNum; - } - noway_assert(varDsc->lvIsParam); - - if (varDsc->lvIsRegArg && (varDsc->GetRegNum() != REG_STK)) - { - // Skip reg args which are already in its right register for jmp call. - // If not, we will spill such args to their stack locations. - // - // If we need to generate a tail call profiler hook, then spill all - // arg regs to free them up for the callback. - if (!compiler->compIsProfilerHookNeeded() && (varDsc->GetRegNum() == varDsc->GetArgReg())) - { - continue; - } - } - else if (varDsc->GetRegNum() == REG_STK) - { - // Skip args which are currently living in stack. - continue; - } - - // If we came here it means either a reg argument not in the right register or - // a stack argument currently living in a register. In either case the following - // assert should hold. - assert(varDsc->GetRegNum() != REG_STK); - - assert(!varDsc->lvIsStructField || (compiler->lvaTable[varDsc->lvParentLcl].lvFieldCnt == 1)); - var_types storeType = genActualType(varDsc->lvaArgType()); // We own the memory and can use the full move. - GetEmitter()->emitIns_S_R(ins_Store(storeType), emitTypeSize(storeType), varDsc->GetRegNum(), varNum, 0); - - // Update lvRegNum life and GC info to indicate lvRegNum is dead and varDsc stack slot is going live. - // Note that we cannot modify varDsc->GetRegNum() here because another basic block may not be expecting it. - // Therefore manually update life of varDsc->GetRegNum(). - regMaskTP tempMask = varDsc->lvRegMask(); - regSet.RemoveMaskVars(tempMask); - gcInfo.gcMarkRegSetNpt(tempMask); - if (compiler->lvaIsGCTracked(varDsc)) - { -#ifdef DEBUG - if (!VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) - { - JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming live\n", varNum); - } - else - { - JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing live\n", varNum); - } -#endif // DEBUG - - VarSetOps::AddElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); - } - } - -#ifdef PROFILING_SUPPORTED - // At this point all arg regs are free. - // Emit tail call profiler callback. 
- genProfilingLeaveCallback(CORINFO_HELP_PROF_FCN_TAILCALL); -#endif - - // Next move any un-enregistered register arguments back to their register. - regMaskTP fixedIntArgMask = RBM_NONE; // tracks the int arg regs occupying fixed args in case of a vararg method. - unsigned firstArgVarNum = BAD_VAR_NUM; // varNum of the first argument in case of a vararg method. - for (varNum = 0; (varNum < compiler->info.compArgsCount); varNum++) - { - varDsc = compiler->lvaTable + varNum; - if (varDsc->lvPromoted) - { - noway_assert(varDsc->lvFieldCnt == 1); // We only handle one field here - - unsigned fieldVarNum = varDsc->lvFieldLclStart; - varDsc = compiler->lvaTable + fieldVarNum; - } - noway_assert(varDsc->lvIsParam); - - // Skip if arg not passed in a register. - if (!varDsc->lvIsRegArg) - { - continue; - } - -#if defined(UNIX_AMD64_ABI) - if (varTypeIsStruct(varDsc)) - { - CORINFO_CLASS_HANDLE typeHnd = varDsc->GetStructHnd(); - assert(typeHnd != nullptr); - - SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; - compiler->eeGetSystemVAmd64PassStructInRegisterDescriptor(typeHnd, &structDesc); - assert(structDesc.passedInRegisters); - - unsigned __int8 offset0 = 0; - unsigned __int8 offset1 = 0; - var_types type0 = TYP_UNKNOWN; - var_types type1 = TYP_UNKNOWN; - - // Get the eightbyte data - compiler->GetStructTypeOffset(structDesc, &type0, &type1, &offset0, &offset1); - - // Move the values into the right registers. - // - - // Update varDsc->GetArgReg() and lvOtherArgReg life and GC Info to indicate varDsc stack slot is dead and - // argReg is going live. Note that we cannot modify varDsc->GetRegNum() and lvOtherArgReg here - // because another basic block may not be expecting it. - // Therefore manually update life of argReg. Note that GT_JMP marks - // the end of the basic block and after which reg life and gc info will be recomputed for the new block in - // genCodeForBBList(). - if (type0 != TYP_UNKNOWN) - { - GetEmitter()->emitIns_R_S(ins_Load(type0), emitTypeSize(type0), varDsc->GetArgReg(), varNum, offset0); - regSet.SetMaskVars(regSet.GetMaskVars() | genRegMask(varDsc->GetArgReg())); - gcInfo.gcMarkRegPtrVal(varDsc->GetArgReg(), type0); - } - - if (type1 != TYP_UNKNOWN) - { - GetEmitter()->emitIns_R_S(ins_Load(type1), emitTypeSize(type1), varDsc->GetOtherArgReg(), varNum, - offset1); - regSet.SetMaskVars(regSet.GetMaskVars() | genRegMask(varDsc->GetOtherArgReg())); - gcInfo.gcMarkRegPtrVal(varDsc->GetOtherArgReg(), type1); - } - - if (varDsc->lvTracked) - { - VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); - } - } - else -#endif // !defined(UNIX_AMD64_ABI) - { - // Register argument - CLANG_FORMAT_COMMENT_ANCHOR; -#ifdef TARGET_X86 - noway_assert( - isRegParamType(genActualType(varDsc->TypeGet())) || - (varTypeIsStruct(varDsc->TypeGet()) && compiler->isTrivialPointerSizedStruct(varDsc->GetStructHnd()))); -#else - noway_assert(isRegParamType(genActualType(varDsc->TypeGet()))); -#endif // TARGET_X86 - - // Is register argument already in the right register? - // If not load it from its stack location. - var_types loadType = varDsc->lvaArgType(); - -#ifdef TARGET_X86 - if (varTypeIsStruct(varDsc->TypeGet())) - { - // Treat trivial pointer-sized structs as a pointer sized primitive - // for the purposes of registers. 
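A small standalone sketch of the two-eightbyte reload described in the UNIX_AMD64_ABI block above, assuming a {double, int64_t} layout, which the SysV classifier splits into an SSE eightbyte and an INTEGER eightbyte (types and names here are illustrative, not CORINFO structures):

#include <cstdint>

struct TwoEightbytes { double d; int64_t i; }; // classified as {SSE, INTEGER} under the SysV ABI

// The first eightbyte is reloaded into the arg register for type0 at offset0,
// the second into lvOtherArgReg at offset1.
inline void LoadEightbytes(const TwoEightbytes& slot, double& xmmArg, int64_t& intArg)
{
    xmmArg = slot.d;   // movsd xmm, [varNum + 0]
    intArg = slot.i;   // mov   reg, [varNum + 8]
}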
- loadType = TYP_I_IMPL; - } -#endif - - regNumber argReg = varDsc->GetArgReg(); // incoming arg register - - if (varDsc->GetRegNum() != argReg) - { - assert(genIsValidReg(argReg)); - GetEmitter()->emitIns_R_S(ins_Load(loadType), emitTypeSize(loadType), argReg, varNum, 0); - - // Update argReg life and GC Info to indicate varDsc stack slot is dead and argReg is going live. - // Note that we cannot modify varDsc->GetRegNum() here because another basic block may not be - // expecting it. Therefore manually update life of argReg. Note that GT_JMP marks the end of the - // basic block and after which reg life and gc info will be recomputed for the new block in - // genCodeForBBList(). - regSet.AddMaskVars(genRegMask(argReg)); - gcInfo.gcMarkRegPtrVal(argReg, loadType); - if (compiler->lvaIsGCTracked(varDsc)) - { -#ifdef DEBUG - if (VarSetOps::IsMember(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex)) - { - JITDUMP("\t\t\t\t\t\t\tVar V%02u becoming dead\n", varNum); - } - else - { - JITDUMP("\t\t\t\t\t\t\tVar V%02u continuing dead\n", varNum); - } -#endif // DEBUG - - VarSetOps::RemoveElemD(compiler, gcInfo.gcVarPtrSetCur, varDsc->lvVarIndex); - } - } - } - -#if FEATURE_VARARG && defined(TARGET_AMD64) - // In case of a jmp call to a vararg method also pass the float/double arg in the corresponding int arg - // register. This is due to the AMD64 ABI which requires floating point values passed to varargs functions to - // be passed in both integer and floating point registers. It doesn't apply to x86, which passes floating point - // values on the stack. - if (compiler->info.compIsVarArgs) - { - regNumber intArgReg; - var_types loadType = varDsc->lvaArgType(); - regNumber argReg = varDsc->GetArgReg(); // incoming arg register - - if (varTypeIsFloating(loadType)) - { - intArgReg = compiler->getCallArgIntRegister(argReg); - instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); - inst_RV_RV(ins, argReg, intArgReg, loadType); - } - else - { - intArgReg = argReg; - } - - fixedIntArgMask |= genRegMask(intArgReg); - - if (intArgReg == REG_ARG_0) - { - assert(firstArgVarNum == BAD_VAR_NUM); - firstArgVarNum = varNum; - } - } -#endif // FEATURE_VARARG - } - -#if FEATURE_VARARG && defined(TARGET_AMD64) - // Jmp call to a vararg method - if the method has fewer than 4 fixed arguments, - // load the remaining arg registers (both int and float) from the corresponding - // shadow stack slots. This is for the reason that we don't know the number and type - // of non-fixed params passed by the caller, therefore we have to assume the worst case - // of caller passing float/double args both in int and float arg regs. - // - // This doesn't apply to x86, which doesn't pass floating point values in floating - // point registers. - // - // The caller could have passed gc-ref/byref type var args. Since these are var args - // the callee no way of knowing their gc-ness. Therefore, mark the region that loads - // remaining arg registers from shadow stack slots as non-gc interruptible. 
- if (fixedIntArgMask != RBM_NONE) - { - assert(compiler->info.compIsVarArgs); - assert(firstArgVarNum != BAD_VAR_NUM); - - regMaskTP remainingIntArgMask = RBM_ARG_REGS & ~fixedIntArgMask; - if (remainingIntArgMask != RBM_NONE) - { - instruction insCopyIntToFloat = ins_CopyIntToFloat(TYP_LONG, TYP_DOUBLE); - GetEmitter()->emitDisableGC(); - for (int argNum = 0, argOffset = 0; argNum < MAX_REG_ARG; ++argNum) - { - regNumber argReg = intArgRegs[argNum]; - regMaskTP argRegMask = genRegMask(argReg); - - if ((remainingIntArgMask & argRegMask) != 0) - { - remainingIntArgMask &= ~argRegMask; - GetEmitter()->emitIns_R_S(INS_mov, EA_8BYTE, argReg, firstArgVarNum, argOffset); - - // also load it in corresponding float arg reg - regNumber floatReg = compiler->getCallArgFloatRegister(argReg); - inst_RV_RV(insCopyIntToFloat, floatReg, argReg); - } - - argOffset += REGSIZE_BYTES; - } - GetEmitter()->emitEnableGC(); - } - } -#endif // FEATURE_VARARG -} - -// produce code for a GT_LEA subnode -void CodeGen::genLeaInstruction(GenTreeAddrMode* lea) -{ - emitAttr size = emitTypeSize(lea); - genConsumeOperands(lea); - - if (lea->Base() && lea->Index()) - { - regNumber baseReg = lea->Base()->GetRegNum(); - regNumber indexReg = lea->Index()->GetRegNum(); - GetEmitter()->emitIns_R_ARX(INS_lea, size, lea->GetRegNum(), baseReg, indexReg, lea->gtScale, lea->Offset()); - } - else if (lea->Base()) - { - GetEmitter()->emitIns_R_AR(INS_lea, size, lea->GetRegNum(), lea->Base()->GetRegNum(), lea->Offset()); - } - else if (lea->Index()) - { - GetEmitter()->emitIns_R_ARX(INS_lea, size, lea->GetRegNum(), REG_NA, lea->Index()->GetRegNum(), lea->gtScale, - lea->Offset()); - } - - genProduceReg(lea); -} - -//------------------------------------------------------------------------ -// genCompareFloat: Generate code for comparing two floating point values -// -// Arguments: -// treeNode - the compare tree -// -void CodeGen::genCompareFloat(GenTree* treeNode) -{ - assert(treeNode->OperIsCompare()); - - GenTreeOp* tree = treeNode->AsOp(); - GenTree* op1 = tree->gtOp1; - GenTree* op2 = tree->gtOp2; - var_types op1Type = op1->TypeGet(); - var_types op2Type = op2->TypeGet(); - - genConsumeOperands(tree); - - assert(varTypeIsFloating(op1Type)); - assert(op1Type == op2Type); - - regNumber targetReg = treeNode->GetRegNum(); - instruction ins; - emitAttr cmpAttr; - - GenCondition condition = GenCondition::FromFloatRelop(treeNode); - - if (condition.PreferSwap()) - { - condition = GenCondition::Swap(condition); - std::swap(op1, op2); - } - - ins = ins_FloatCompare(op1Type); - cmpAttr = emitTypeSize(op1Type); - - GetEmitter()->emitInsBinary(ins, cmpAttr, op1, op2); - - // Are we evaluating this into a register? - if (targetReg != REG_NA) - { - if ((condition.GetCode() == GenCondition::FNEU) && (op1->GetRegNum() == op2->GetRegNum())) - { - // For floating point, `x != x` is a common way of - // checking for NaN. So, in the case where both - // operands are the same, we can optimize codegen - // to only do a single check. - - condition = GenCondition(GenCondition::P); - } - - inst_SETCC(condition, treeNode->TypeGet(), targetReg); - genProduceReg(tree); - } -} - -//------------------------------------------------------------------------ -// genCompareInt: Generate code for comparing ints or, on amd64, longs. -// -// Arguments: -// treeNode - the compare tree -// -// Return Value: -// None. 
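The FNEU special case in genCompareFloat above is the standard NaN idiom; a minimal sketch of the property being exploited (standalone C++):

// "x != x" is true only when x is NaN, so when both operands of an unordered-not-equal
// compare are the same register, one ucomisd plus a parity-flag check is enough.
inline bool IsNaN(double x)
{
    return x != x; // ucomisd xmm, xmm ; setp al
}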
-void CodeGen::genCompareInt(GenTree* treeNode) -{ - assert(treeNode->OperIsCompare() || treeNode->OperIs(GT_CMP)); - - GenTreeOp* tree = treeNode->AsOp(); - GenTree* op1 = tree->gtOp1; - GenTree* op2 = tree->gtOp2; - var_types op1Type = op1->TypeGet(); - var_types op2Type = op2->TypeGet(); - regNumber targetReg = tree->GetRegNum(); - emitter* emit = GetEmitter(); - bool canReuseFlags = false; - - genConsumeOperands(tree); - - assert(!op1->isContainedIntOrIImmed()); - assert(!varTypeIsFloating(op2Type)); - - instruction ins; - var_types type = TYP_UNKNOWN; - - if (tree->OperIs(GT_TEST_EQ, GT_TEST_NE)) - { - ins = INS_test; - - // Unlike many xarch instructions TEST doesn't have a form with a 16/32/64 bit first operand and - // an 8 bit immediate second operand. But if the immediate value fits in 8 bits then we can simply - // emit a 8 bit TEST instruction, unless we're targeting x86 and the first operand is a non-byteable - // register. - // Note that lowering does something similar but its main purpose is to allow memory operands to be - // contained so it doesn't handle other kind of operands. It could do more but on x86 that results - // in additional register constrains and that may be worse than wasting 3 bytes on an immediate. - if ( -#ifdef TARGET_X86 - (!op1->isUsedFromReg() || isByteReg(op1->GetRegNum())) && -#endif - (op2->IsCnsIntOrI() && genSmallTypeCanRepresentValue(TYP_UBYTE, op2->AsIntCon()->IconValue()))) - { - type = TYP_UBYTE; - } - } - else if (op1->isUsedFromReg() && op2->IsIntegralConst(0)) - { - if (compiler->opts.OptimizationEnabled()) - { - emitAttr op1Size = emitActualTypeSize(op1->TypeGet()); - assert((int)op1Size >= 4); - - // Optimize "x<0" and "x>=0" to "x>>31" if "x" is not a jump condition and in a reg. - // Morph/Lowering are responsible to rotate "00" so we won't handle it here. - if ((targetReg != REG_NA) && tree->OperIs(GT_LT, GT_GE) && !tree->IsUnsigned()) - { - if (targetReg != op1->GetRegNum()) - { - inst_RV_RV(INS_mov, targetReg, op1->GetRegNum(), op1->TypeGet()); - } - if (tree->OperIs(GT_GE)) - { - // emit "not" for "x>=0" case - inst_RV(INS_not, targetReg, op1->TypeGet()); - } - inst_RV_IV(INS_shr_N, targetReg, (int)op1Size * 8 - 1, op1Size); - genProduceReg(tree); - return; - } - canReuseFlags = true; - } - - // We're comparing a register to 0 so we can generate "test reg1, reg1" - // instead of the longer "cmp reg1, 0" - ins = INS_test; - op2 = op1; - } - else - { - ins = INS_cmp; - } - - if (type == TYP_UNKNOWN) - { - if (op1Type == op2Type) - { - type = op1Type; - } - else if (genTypeSize(op1Type) == genTypeSize(op2Type)) - { - // If the types are different but have the same size then we'll use TYP_INT or TYP_LONG. - // This primarily deals with small type mixes (e.g. byte/ubyte) that need to be widened - // and compared as int. We should not get long type mixes here but handle that as well - // just in case. - type = genTypeSize(op1Type) == 8 ? TYP_LONG : TYP_INT; - } - else - { - // In the types are different simply use TYP_INT. This deals with small type/int type - // mixes (e.g. byte/short ubyte/int) that need to be widened and compared as int. - // Lowering is expected to handle any mixes that involve long types (e.g. int/long). 
- type = TYP_INT; - } - - // The common type cannot be smaller than any of the operand types, we're probably mixing int/long - assert(genTypeSize(type) >= max(genTypeSize(op1Type), genTypeSize(op2Type))); - // Small unsigned int types (TYP_BOOL can use anything) should use unsigned comparisons - assert(!(varTypeIsSmallInt(type) && varTypeIsUnsigned(type)) || ((tree->gtFlags & GTF_UNSIGNED) != 0)); - // If op1 is smaller then it cannot be in memory, we're probably missing a cast - assert((genTypeSize(op1Type) >= genTypeSize(type)) || !op1->isUsedFromMemory()); - // If op2 is smaller then it cannot be in memory, we're probably missing a cast - assert((genTypeSize(op2Type) >= genTypeSize(type)) || !op2->isUsedFromMemory()); - // If we ended up with a small type and op2 is a constant then make sure we don't lose constant bits - assert(!op2->IsCnsIntOrI() || !varTypeIsSmall(type) || - genSmallTypeCanRepresentValue(type, op2->AsIntCon()->IconValue())); - } - - // The type cannot be larger than the machine word size - assert(genTypeSize(type) <= genTypeSize(TYP_I_IMPL)); - // TYP_UINT and TYP_ULONG should not appear here, only small types can be unsigned - assert(!varTypeIsUnsigned(type) || varTypeIsSmall(type)); - - bool needsOCFlags = !tree->OperIs(GT_EQ, GT_NE); - if (canReuseFlags && emit->AreFlagsSetToZeroCmp(op1->GetRegNum(), emitTypeSize(type), needsOCFlags)) - { - JITDUMP("Not emitting compare due to flags being already set\n"); - } - else - { - emit->emitInsBinary(ins, emitTypeSize(type), op1, op2); - } - - // Are we evaluating this into a register? - if (targetReg != REG_NA) - { - inst_SETCC(GenCondition::FromIntegralRelop(tree), tree->TypeGet(), targetReg); - genProduceReg(tree); - } -} - -#if !defined(TARGET_64BIT) -//------------------------------------------------------------------------ -// genLongToIntCast: Generate code for long to int casts on x86. -// -// Arguments: -// cast - The GT_CAST node -// -// Return Value: -// None. -// -// Assumptions: -// The cast node and its sources (via GT_LONG) must have been assigned registers. -// The destination cannot be a floating point type or a small integer type. -// -void CodeGen::genLongToIntCast(GenTree* cast) -{ - assert(cast->OperGet() == GT_CAST); - - GenTree* src = cast->gtGetOp1(); - noway_assert(src->OperGet() == GT_LONG); - - genConsumeRegs(src); - - var_types srcType = ((cast->gtFlags & GTF_UNSIGNED) != 0) ? 
TYP_ULONG : TYP_LONG; - var_types dstType = cast->CastToType(); - regNumber loSrcReg = src->gtGetOp1()->GetRegNum(); - regNumber hiSrcReg = src->gtGetOp2()->GetRegNum(); - regNumber dstReg = cast->GetRegNum(); - - assert((dstType == TYP_INT) || (dstType == TYP_UINT)); - assert(genIsValidIntReg(loSrcReg)); - assert(genIsValidIntReg(hiSrcReg)); - assert(genIsValidIntReg(dstReg)); - - if (cast->gtOverflow()) - { - // - // Generate an overflow check for [u]long to [u]int casts: - // - // long -> int - check if the upper 33 bits are all 0 or all 1 - // - // ulong -> int - check if the upper 33 bits are all 0 - // - // long -> uint - check if the upper 32 bits are all 0 - // ulong -> uint - check if the upper 32 bits are all 0 - // - - if ((srcType == TYP_LONG) && (dstType == TYP_INT)) - { - BasicBlock* allOne = genCreateTempLabel(); - BasicBlock* success = genCreateTempLabel(); - - inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE); - inst_JMP(EJ_js, allOne); - - inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE); - genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); - inst_JMP(EJ_jmp, success); - - genDefineTempLabel(allOne); - inst_RV_IV(INS_cmp, hiSrcReg, -1, EA_4BYTE); - genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); - - genDefineTempLabel(success); - } - else - { - if ((srcType == TYP_ULONG) && (dstType == TYP_INT)) - { - inst_RV_RV(INS_test, loSrcReg, loSrcReg, TYP_INT, EA_4BYTE); - genJumpToThrowHlpBlk(EJ_js, SCK_OVERFLOW); - } - - inst_RV_RV(INS_test, hiSrcReg, hiSrcReg, TYP_INT, EA_4BYTE); - genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); - } - } - - if (dstReg != loSrcReg) - { - inst_RV_RV(INS_mov, dstReg, loSrcReg, TYP_INT, EA_4BYTE); - } - - genProduceReg(cast); -} -#endif - -//------------------------------------------------------------------------ -// genIntCastOverflowCheck: Generate overflow checking code for an integer cast. -// -// Arguments: -// cast - The GT_CAST node -// desc - The cast description -// reg - The register containing the value to check -// -void CodeGen::genIntCastOverflowCheck(GenTreeCast* cast, const GenIntCastDesc& desc, regNumber reg) -{ - switch (desc.CheckKind()) - { - case GenIntCastDesc::CHECK_POSITIVE: - GetEmitter()->emitIns_R_R(INS_test, EA_SIZE(desc.CheckSrcSize()), reg, reg); - genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); - break; - -#ifdef TARGET_64BIT - case GenIntCastDesc::CHECK_UINT_RANGE: - { - // We need to check if the value is not greater than 0xFFFFFFFF but this value - // cannot be encoded in an immediate operand. Use a right shift to test if the - // upper 32 bits are zero. This requires a temporary register. 
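The overflow checks above correspond to simple scalar range tests; a sketch under the assumption that the value is already in a 64-bit register (standalone C++, hypothetical function names):

#include <cstdint>

// long -> int (CHECK_INT_RANGE): the upper 33 bits must be all zeros or all ones.
inline bool FitsInInt32(int64_t v)          { return v == static_cast<int32_t>(v); }

// CHECK_UINT_RANGE: the upper 32 bits must be zero (the shr-by-32 test above).
inline bool FitsInUInt32(uint64_t v)        { return (v >> 32) == 0; }

// CHECK_POSITIVE_INT_RANGE: unsigned compare against INT32_MAX (cmp ; ja -> overflow).
inline bool FitsInPositiveInt32(uint64_t v) { return v <= static_cast<uint64_t>(INT32_MAX); }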
- const regNumber tempReg = cast->GetSingleTempReg(); - assert(tempReg != reg); - GetEmitter()->emitIns_R_R(INS_mov, EA_8BYTE, tempReg, reg); - GetEmitter()->emitIns_R_I(INS_shr_N, EA_8BYTE, tempReg, 32); - genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW); - } - break; - - case GenIntCastDesc::CHECK_POSITIVE_INT_RANGE: - GetEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX); - genJumpToThrowHlpBlk(EJ_ja, SCK_OVERFLOW); - break; - - case GenIntCastDesc::CHECK_INT_RANGE: - GetEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MAX); - genJumpToThrowHlpBlk(EJ_jg, SCK_OVERFLOW); - GetEmitter()->emitIns_R_I(INS_cmp, EA_8BYTE, reg, INT32_MIN); - genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); - break; -#endif - - default: - { - assert(desc.CheckKind() == GenIntCastDesc::CHECK_SMALL_INT_RANGE); - const int castMaxValue = desc.CheckSmallIntMax(); - const int castMinValue = desc.CheckSmallIntMin(); - - GetEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMaxValue); - genJumpToThrowHlpBlk((castMinValue == 0) ? EJ_ja : EJ_jg, SCK_OVERFLOW); - - if (castMinValue != 0) - { - GetEmitter()->emitIns_R_I(INS_cmp, EA_SIZE(desc.CheckSrcSize()), reg, castMinValue); - genJumpToThrowHlpBlk(EJ_jl, SCK_OVERFLOW); - } - } - break; - } -} - -//------------------------------------------------------------------------ -// genIntToIntCast: Generate code for an integer cast, with or without overflow check. -// -// Arguments: -// cast - The GT_CAST node -// -// Assumptions: -// The cast node is not a contained node and must have an assigned register. -// Neither the source nor target type can be a floating point type. -// On x86 casts to (U)BYTE require that the source be in a byte register. -// -// TODO-XArch-CQ: Allow castOp to be a contained node without an assigned register. 
-// -void CodeGen::genIntToIntCast(GenTreeCast* cast) -{ - genConsumeRegs(cast->gtGetOp1()); - - const regNumber srcReg = cast->gtGetOp1()->GetRegNum(); - const regNumber dstReg = cast->GetRegNum(); - emitter* emit = GetEmitter(); - - assert(genIsValidIntReg(srcReg)); - assert(genIsValidIntReg(dstReg)); - - GenIntCastDesc desc(cast); - - if (desc.CheckKind() != GenIntCastDesc::CHECK_NONE) - { - genIntCastOverflowCheck(cast, desc, srcReg); - } - - if ((desc.ExtendKind() != GenIntCastDesc::COPY) || (srcReg != dstReg)) - { - instruction ins; - unsigned insSize; - bool canSkip = false; - - switch (desc.ExtendKind()) - { - case GenIntCastDesc::ZERO_EXTEND_SMALL_INT: - ins = INS_movzx; - insSize = desc.ExtendSrcSize(); - break; - case GenIntCastDesc::SIGN_EXTEND_SMALL_INT: - ins = INS_movsx; - insSize = desc.ExtendSrcSize(); - break; -#ifdef TARGET_64BIT - case GenIntCastDesc::ZERO_EXTEND_INT: - // We can skip emitting this zero extending move if the previous instruction zero extended implicitly - if ((srcReg == dstReg) && compiler->opts.OptimizationEnabled()) - { - canSkip = emit->AreUpper32BitsZero(srcReg); - } - - ins = INS_mov; - insSize = 4; - break; - case GenIntCastDesc::SIGN_EXTEND_INT: - ins = INS_movsxd; - insSize = 4; - break; -#endif - default: - assert(desc.ExtendKind() == GenIntCastDesc::COPY); - assert(srcReg != dstReg); - ins = INS_mov; - insSize = desc.ExtendSrcSize(); - break; - } - - if (canSkip) - { - JITDUMP("\n -- suppressing emission as previous instruction already properly extends.\n"); - } - else - { - emit->emitIns_R_R(ins, EA_ATTR(insSize), dstReg, srcReg); - } - } - - genProduceReg(cast); -} - -//------------------------------------------------------------------------ -// genFloatToFloatCast: Generate code for a cast between float and double -// -// Arguments: -// treeNode - The GT_CAST node -// -// Return Value: -// None. -// -// Assumptions: -// Cast is a non-overflow conversion. -// The treeNode must have an assigned register. -// The cast is between float and double or vice versa. -// -void CodeGen::genFloatToFloatCast(GenTree* treeNode) -{ - // float <--> double conversions are always non-overflow ones - assert(treeNode->OperGet() == GT_CAST); - assert(!treeNode->gtOverflow()); - - regNumber targetReg = treeNode->GetRegNum(); - assert(genIsValidFloatReg(targetReg)); - - GenTree* op1 = treeNode->AsOp()->gtOp1; -#ifdef DEBUG - // If not contained, must be a valid float reg. - if (op1->isUsedFromReg()) - { - assert(genIsValidFloatReg(op1->GetRegNum())); - } -#endif - - var_types dstType = treeNode->CastToType(); - var_types srcType = op1->TypeGet(); - assert(varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); - - genConsumeOperands(treeNode->AsOp()); - if (srcType == dstType && (op1->isUsedFromReg() && (targetReg == op1->GetRegNum()))) - { - // source and destinations types are the same and also reside in the same register. - // we just need to consume and produce the reg in this case. - ; - } - else - { - instruction ins = ins_FloatConv(dstType, srcType); - GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); - } - - genProduceReg(treeNode); -} - -//------------------------------------------------------------------------ -// genIntToFloatCast: Generate code to cast an int/long to float/double -// -// Arguments: -// treeNode - The GT_CAST node -// -// Return Value: -// None. -// -// Assumptions: -// Cast is a non-overflow conversion. -// The treeNode must have an assigned register. 
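The AreUpper32BitsZero elision in genIntToIntCast above rests on the x64 rule that writing a 32-bit register clears bits 32..63; a small standalone example of why the extra zero-extending mov is redundant:

#include <cstdint>

inline uint64_t ZeroExtendedSum(uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;   // a 32-bit add already zeroes the upper half of the destination register
    return sum;             // so no separate movzx / "mov r32, r32" is required
}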
-// SrcType= int32/uint32/int64/uint64 and DstType=float/double. -// -void CodeGen::genIntToFloatCast(GenTree* treeNode) -{ - // int type --> float/double conversions are always non-overflow ones - assert(treeNode->OperGet() == GT_CAST); - assert(!treeNode->gtOverflow()); - - regNumber targetReg = treeNode->GetRegNum(); - assert(genIsValidFloatReg(targetReg)); - - GenTree* op1 = treeNode->AsOp()->gtOp1; -#ifdef DEBUG - if (op1->isUsedFromReg()) - { - assert(genIsValidIntReg(op1->GetRegNum())); - } -#endif - - var_types dstType = treeNode->CastToType(); - var_types srcType = op1->TypeGet(); - assert(!varTypeIsFloating(srcType) && varTypeIsFloating(dstType)); - -#if !defined(TARGET_64BIT) - // We expect morph to replace long to float/double casts with helper calls - noway_assert(!varTypeIsLong(srcType)); -#endif // !defined(TARGET_64BIT) - - // Since xarch emitter doesn't handle reporting gc-info correctly while casting away gc-ness we - // ensure srcType of a cast is non gc-type. Codegen should never see BYREF as source type except - // for GT_LCL_VAR_ADDR and GT_LCL_FLD_ADDR that represent stack addresses and can be considered - // as TYP_I_IMPL. In all other cases where src operand is a gc-type and not known to be on stack, - // Front-end (see fgMorphCast()) ensures this by assigning gc-type local to a non gc-type - // temp and using temp as operand of cast operation. - if (srcType == TYP_BYREF) - { - noway_assert(op1->OperGet() == GT_LCL_VAR_ADDR || op1->OperGet() == GT_LCL_FLD_ADDR); - srcType = TYP_I_IMPL; - } - - // force the srcType to unsigned if GT_UNSIGNED flag is set - if (treeNode->gtFlags & GTF_UNSIGNED) - { - srcType = genUnsignedType(srcType); - } - - noway_assert(!varTypeIsGC(srcType)); - - // We should never be seeing srcType whose size is not sizeof(int) nor sizeof(long). - // For conversions from byte/sbyte/int16/uint16 to float/double, we would expect - // either the front-end or lowering phase to have generated two levels of cast. - // The first one is for widening smaller int type to int32 and the second one is - // to the float/double. - emitAttr srcSize = EA_ATTR(genTypeSize(srcType)); - noway_assert((srcSize == EA_ATTR(genTypeSize(TYP_INT))) || (srcSize == EA_ATTR(genTypeSize(TYP_LONG)))); - - // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions - // here since they should have been lowered apropriately. - noway_assert(srcType != TYP_UINT); - noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); - - // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used - // which does a partial write to lower 4/8 bytes of xmm register keeping the other - // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop, - // the partial write could introduce a false dependency and could cause a stall - // if there are further uses of xmmReg. We have such a case occurring with a - // customer reported version of SpectralNorm benchmark, resulting in 2x perf - // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before - // cvtsi2ss/sd instruction. - - genConsumeOperands(treeNode->AsOp()); - GetEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->GetRegNum(), treeNode->GetRegNum()); - - // Note that here we need to specify srcType that will determine - // the size of source reg/mem operand and rex.w prefix. - instruction ins = ins_FloatConv(dstType, TYP_INT); - GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); - - // Handle the case of srcType = TYP_ULONG. 
SSE2 conversion instruction - // will interpret ULONG value as LONG. Hence we need to adjust the - // result if sign-bit of srcType is set. - if (srcType == TYP_ULONG) - { - // The instruction sequence below is less accurate than what clang - // and gcc generate. However, we keep the current sequence for backward compatibility. - // If we change the instructions below, FloatingPointUtils::convertUInt64ToDobule - // should be also updated for consistent conversion result. - assert(dstType == TYP_DOUBLE); - assert(op1->isUsedFromReg()); - - // Set the flags without modifying op1. - // test op1Reg, op1Reg - inst_RV_RV(INS_test, op1->GetRegNum(), op1->GetRegNum(), srcType); - - // No need to adjust result if op1 >= 0 i.e. positive - // Jge label - BasicBlock* label = genCreateTempLabel(); - inst_JMP(EJ_jge, label); - - // Adjust the result - // result = result + 0x43f00000 00000000 - // addsd resultReg, 0x43f00000 00000000 - CORINFO_FIELD_HANDLE* cns = &u8ToDblBitmask; - if (*cns == nullptr) - { - double d; - static_assert_no_msg(sizeof(double) == sizeof(__int64)); - *((__int64*)&d) = 0x43f0000000000000LL; - - *cns = GetEmitter()->emitFltOrDblConst(d, EA_8BYTE); - } - GetEmitter()->emitIns_R_C(INS_addsd, EA_8BYTE, treeNode->GetRegNum(), *cns, 0); - - genDefineTempLabel(label); - } - - genProduceReg(treeNode); -} - -//------------------------------------------------------------------------ -// genFloatToIntCast: Generate code to cast float/double to int/long -// -// Arguments: -// treeNode - The GT_CAST node -// -// Return Value: -// None. -// -// Assumptions: -// Cast is a non-overflow conversion. -// The treeNode must have an assigned register. -// SrcType=float/double and DstType= int32/uint32/int64/uint64 -// -// TODO-XArch-CQ: (Low-pri) - generate in-line code when DstType = uint64 -// -void CodeGen::genFloatToIntCast(GenTree* treeNode) -{ - // we don't expect to see overflow detecting float/double --> int type conversions here - // as they should have been converted into helper calls by front-end. - assert(treeNode->OperGet() == GT_CAST); - assert(!treeNode->gtOverflow()); - - regNumber targetReg = treeNode->GetRegNum(); - assert(genIsValidIntReg(targetReg)); - - GenTree* op1 = treeNode->AsOp()->gtOp1; -#ifdef DEBUG - if (op1->isUsedFromReg()) - { - assert(genIsValidFloatReg(op1->GetRegNum())); - } -#endif - - var_types dstType = treeNode->CastToType(); - var_types srcType = op1->TypeGet(); - assert(varTypeIsFloating(srcType) && !varTypeIsFloating(dstType)); - - // We should never be seeing dstType whose size is neither sizeof(TYP_INT) nor sizeof(TYP_LONG). - // For conversions to byte/sbyte/int16/uint16 from float/double, we would expect the - // front-end or lowering phase to have generated two levels of cast. The first one is - // for float or double to int32/uint32 and the second one for narrowing int32/uint32 to - // the required smaller int type. - emitAttr dstSize = EA_ATTR(genTypeSize(dstType)); - noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG)))); - - // We shouldn't be seeing uint64 here as it should have been converted - // into a helper call by either front-end or lowering phase. - noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); - - // If the dstType is TYP_UINT, we have 32-bits to encode the - // float number. Any of 33rd or above bits can be the sign bit. - // To achieve it we pretend as if we are converting it to a long. 
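The ULONG adjustment described above relies on 0x43f0000000000000 being the IEEE-754 encoding of 2^64: the value is first converted as a signed long, then 2^64 is added back when the sign bit was set. A standalone sketch of the same correction (uint64ToDouble is an invented name; as the original comment notes, this sequence can be slightly less accurate than a direct conversion):

#include <cstdint>
#include <cstring>
#include <cassert>

// Illustrative only: the correction applied after cvtsi2sd in the deleted
// genIntToFloatCast code. 0x43f0000000000000 is the bit pattern of 2^64.
double uint64ToDouble(uint64_t v)
{
    double d = static_cast<double>(static_cast<int64_t>(v)); // signed conversion
    if (static_cast<int64_t>(v) < 0)                         // sign bit was set
    {
        uint64_t bits = 0x43f0000000000000ull;               // 2^64 as a double
        double twoTo64;
        std::memcpy(&twoTo64, &bits, sizeof(twoTo64));
        d += twoTo64;                                        // add 2^64 back
    }
    return d;
}

int main()
{
    assert(uint64ToDouble(0x8000000000000000ull) == 9223372036854775808.0); // 2^63
    assert(uint64ToDouble(42) == 42.0);
    return 0;
}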
- if (varTypeIsUnsigned(dstType) && (dstSize == EA_ATTR(genTypeSize(TYP_INT)))) - { - dstType = TYP_LONG; - } - - // Note that we need to specify dstType here so that it will determine - // the size of destination integer register and also the rex.w prefix. - genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(TYP_INT, srcType); - GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); - genProduceReg(treeNode); -} - -//------------------------------------------------------------------------ -// genCkfinite: Generate code for ckfinite opcode. -// -// Arguments: -// treeNode - The GT_CKFINITE node -// -// Return Value: -// None. -// -// Assumptions: -// GT_CKFINITE node has reserved an internal register. -// -// TODO-XArch-CQ - mark the operand as contained if known to be in -// memory (e.g. field or an array element). -// -void CodeGen::genCkfinite(GenTree* treeNode) -{ - assert(false); -} - -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) -int CodeGenInterface::genSPtoFPdelta() const -{ - int delta; - -#ifdef UNIX_AMD64_ABI - - // We require frame chaining on Unix to support native tool unwinding (such as - // unwinding by the native debugger). We have a CLR-only extension to the - // unwind codes (UWOP_SET_FPREG_LARGE) to support SP->FP offsets larger than 240. - // If Unix ever supports EnC, the RSP == RBP assumption will have to be reevaluated. - delta = genTotalFrameSize(); - -#else // !UNIX_AMD64_ABI - - // As per Amd64 ABI, RBP offset from initial RSP can be between 0 and 240 if - // RBP needs to be reported in unwind codes. This case would arise for methods - // with localloc. - if (compiler->compLocallocUsed) - { - // We cannot base delta computation on compLclFrameSize since it changes from - // tentative to final frame layout and hence there is a possibility of - // under-estimating offset of vars from FP, which in turn results in under- - // estimating instruction size. - // - // To be predictive and so as never to under-estimate offset of vars from FP - // we will always position FP at min(240, outgoing arg area size). - delta = Min(240, (int)compiler->lvaOutgoingArgSpaceSize); - } - else if (compiler->opts.compDbgEnC) - { - // vm assumption on EnC methods is that rsp and rbp are equal - delta = 0; - } - else - { - delta = genTotalFrameSize(); - } - -#endif // !UNIX_AMD64_ABI - - return delta; -} - -//--------------------------------------------------------------------- -// genTotalFrameSize - return the total size of the stack frame, including local size, -// callee-saved register size, etc. For AMD64, this does not include the caller-pushed -// return address. -// -// Return value: -// Total frame size -// - -int CodeGenInterface::genTotalFrameSize() const -{ - assert(!IsUninitialized(compiler->compCalleeRegsPushed)); - - int totalFrameSize = compiler->compCalleeRegsPushed * REGSIZE_BYTES + compiler->compLclFrameSize; - - assert(totalFrameSize >= 0); - return totalFrameSize; -} - -//--------------------------------------------------------------------- -// genCallerSPtoFPdelta - return the offset from Caller-SP to the frame pointer. -// This number is going to be negative, since the Caller-SP is at a higher -// address than the frame pointer. -// -// There must be a frame pointer to call this function! -// -// We can't compute this directly from the Caller-SP, since the frame pointer -// is based on a maximum delta from Initial-SP, so first we find SP, then -// compute the FP offset. 
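The frame-delta helpers deleted above are pure byte arithmetic: total frame size is callee-saved pushes plus the local area, the caller-SP deltas additionally account for the pushed return address and frame pointer, and the caller-relative results are always non-positive. A sketch of those relationships under the simplest (no localloc, no EnC) assumptions, with an invented Frame struct standing in for the compiler object:

#include <cassert>

// Illustrative only: the offset relationships behind the deleted frame-delta
// helpers. All values are byte offsets; Caller-SP sits at a higher address than
// everything the callee allocates, so caller-relative deltas come out negative.
struct Frame
{
    int  regSize;           // REGSIZE_BYTES
    int  calleeRegsPushed;  // callee-saved registers pushed in the prolog
    int  lclFrameSize;      // local variable area
    bool framePointerUsed;

    int totalFrameSize() const { return calleeRegsPushed * regSize + lclFrameSize; }
    int spToFPdelta() const { return totalFrameSize(); } // simplest case only
    int callerSPtoInitialSPdelta() const
    {
        int d = -totalFrameSize();
        d -= regSize;                       // caller-pushed return address
        if (framePointerUsed) d -= regSize; // pushed frame pointer
        return d;
    }
    int callerSPtoFPdelta() const { return callerSPtoInitialSPdelta() + spToFPdelta(); }
};

int main()
{
    Frame f{8, 4, 64, true};
    assert(f.totalFrameSize() == 96);
    assert(f.callerSPtoInitialSPdelta() == -112); // 96 + return address + saved FP
    assert(f.callerSPtoFPdelta() == -16);         // FP sits just below the saved FP slot
    assert(f.callerSPtoFPdelta() <= 0);
    return 0;
}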
- -int CodeGenInterface::genCallerSPtoFPdelta() const -{ - assert(isFramePointerUsed()); - int callerSPtoFPdelta; - - callerSPtoFPdelta = genCallerSPtoInitialSPdelta() + genSPtoFPdelta(); - - assert(callerSPtoFPdelta <= 0); - return callerSPtoFPdelta; -} - -//--------------------------------------------------------------------- -// genCallerSPtoInitialSPdelta - return the offset from Caller-SP to Initial SP. -// -// This number will be negative. - -int CodeGenInterface::genCallerSPtoInitialSPdelta() const -{ - int callerSPtoSPdelta = 0; - - callerSPtoSPdelta -= genTotalFrameSize(); - callerSPtoSPdelta -= REGSIZE_BYTES; // caller-pushed return address - - // compCalleeRegsPushed does not account for the frame pointer - // TODO-Cleanup: shouldn't this be part of genTotalFrameSize? - if (isFramePointerUsed()) - { - callerSPtoSPdelta -= REGSIZE_BYTES; - } - - assert(callerSPtoSPdelta <= 0); - return callerSPtoSPdelta; -} -#endif // TARGET_AMD64 - -//----------------------------------------------------------------------------------------- -// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask" -// -// Arguments: -// treeNode - tree node -// -// Return value: -// None -// -// Assumptions: -// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs() -// ii) tree type is floating point type. -// iii) caller of this routine needs to call genProduceReg() -void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) -{ - regNumber targetReg = treeNode->GetRegNum(); - regNumber operandReg = genConsumeReg(treeNode->gtGetOp1()); - emitAttr size = emitTypeSize(treeNode); - - assert(varTypeIsFloating(treeNode->TypeGet())); - assert(treeNode->gtGetOp1()->isUsedFromReg()); - - CORINFO_FIELD_HANDLE* maskFld = nullptr; - UINT64 mask = 0; - instruction ins = INS_invalid; - - if (treeNode->OperIs(GT_NEG)) - { - // Neg(x) = flip the sign bit. - // Neg(f) = f ^ 0x80000000 x4 (packed) - // Neg(d) = d ^ 0x8000000000000000 x2 (packed) - ins = INS_xorps; - mask = treeNode->TypeIs(TYP_FLOAT) ? 0x8000000080000000UL : 0x8000000000000000UL; - maskFld = treeNode->TypeIs(TYP_FLOAT) ? &negBitmaskFlt : &negBitmaskDbl; - } - else if (treeNode->OperIs(GT_INTRINSIC)) - { - assert(treeNode->AsIntrinsic()->gtIntrinsicName == NI_System_Math_Abs); - // Abs(x) = set sign-bit to zero - // Abs(f) = f & 0x7fffffff x4 (packed) - // Abs(d) = d & 0x7fffffffffffffff x2 (packed) - ins = INS_andps; - mask = treeNode->TypeIs(TYP_FLOAT) ? 0x7fffffff7fffffffUL : 0x7fffffffffffffffUL; - maskFld = treeNode->TypeIs(TYP_FLOAT) ? 
&absBitmaskFlt : &absBitmaskDbl; - } - else - { - assert(!"genSSE2BitwiseOp: unsupported oper"); - } - - if (*maskFld == nullptr) - { - UINT64 maskPack[] = {mask, mask}; - *maskFld = GetEmitter()->emitBlkConst(&maskPack, 16, 16, treeNode->TypeGet()); - } - - GetEmitter()->emitIns_SIMD_R_R_C(ins, size, targetReg, operandReg, *maskFld, 0); -} - -//----------------------------------------------------------------------------------------- -// genSSE41RoundOp - generate SSE41 code for the given tree as a round operation -// -// Arguments: -// treeNode - tree node -// -// Return value: -// None -// -// Assumptions: -// i) SSE4.1 is supported by the underlying hardware -// ii) treeNode oper is a GT_INTRINSIC -// iii) treeNode type is a floating point type -// iv) treeNode is not used from memory -// v) tree oper is NI_System_Math{F}_Round, _Ceiling, or _Floor -// vi) caller of this routine needs to call genProduceReg() -void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode) -{ - assert(false); -} - -//--------------------------------------------------------------------- -// genIntrinsic - generate code for a given intrinsic -// -// Arguments -// treeNode - the GT_INTRINSIC node -// -// Return value: -// None -// -void CodeGen::genIntrinsic(GenTree* treeNode) -{ - // Right now only Sqrt/Abs are treated as math intrinsics. - switch (treeNode->AsIntrinsic()->gtIntrinsicName) - { - case NI_System_Math_Sqrt: - { - // Both operand and its result must be of the same floating point type. - GenTree* srcNode = treeNode->AsOp()->gtOp1; - assert(varTypeIsFloating(srcNode)); - assert(srcNode->TypeGet() == treeNode->TypeGet()); - - genConsumeOperands(treeNode->AsOp()); - GetEmitter()->emitInsBinary(ins_FloatSqrt(treeNode->TypeGet()), emitTypeSize(treeNode), treeNode, srcNode); - break; - } - - case NI_System_Math_Abs: - genSSE2BitwiseOp(treeNode); - break; - - case NI_System_Math_Round: - case NI_System_Math_Ceiling: - case NI_System_Math_Floor: - genSSE41RoundOp(treeNode->AsOp()); - break; - - default: - assert(!"genIntrinsic: Unsupported intrinsic"); - unreached(); - } - - genProduceReg(treeNode); -} - -//---------------------------------------------------------------------- -// genBitCast - Generate the instruction to move a value between register files -// -// Arguments -// targetType - the destination type -// targetReg - the destination register -// srcType - the source type -// srcReg - the source register -// -void CodeGen::genBitCast(var_types targetType, regNumber targetReg, var_types srcType, regNumber srcReg) -{ - const bool srcFltReg = varTypeUsesFloatReg(srcType) || varTypeIsSIMD(srcType); - assert(srcFltReg == genIsValidFloatReg(srcReg)); - const bool dstFltReg = varTypeUsesFloatReg(targetType) || varTypeIsSIMD(targetType); - assert(dstFltReg == genIsValidFloatReg(targetReg)); - if (srcFltReg != dstFltReg) - { - instruction ins; - regNumber fltReg; - regNumber intReg; - if (dstFltReg) - { - ins = ins_CopyIntToFloat(srcType, targetType); - fltReg = targetReg; - intReg = srcReg; - } - else - { - ins = ins_CopyFloatToInt(srcType, targetType); - intReg = targetReg; - fltReg = srcReg; - } - inst_RV_RV(ins, fltReg, intReg, targetType); - } - else if (targetReg != srcReg) - { - inst_RV_RV(ins_Copy(targetType), targetReg, srcReg, targetType); - } -} - -//---------------------------------------------------------------------- -// genCodeForBitCast - Generate code for a GT_BITCAST that is not contained -// -// Arguments -// treeNode - the GT_BITCAST for which we're generating code -// -void 
CodeGen::genCodeForBitCast(GenTreeOp* treeNode) -{ - regNumber targetReg = treeNode->GetRegNum(); - var_types targetType = treeNode->TypeGet(); - GenTree* op1 = treeNode->gtGetOp1(); - genConsumeRegs(op1); - - if (op1->isContained()) - { - assert(op1->IsLocal() || op1->isIndir()); - if (genIsRegCandidateLocal(op1)) - { - unsigned lclNum = op1->AsLclVar()->GetLclNum(); - GetEmitter()->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lclNum)), - emitTypeSize(treeNode), targetReg, lclNum, 0); - } - else - { - op1->gtType = treeNode->TypeGet(); - op1->SetRegNum(targetReg); - op1->ClearContained(); - JITDUMP("Changing type of BITCAST source to load directly."); - genCodeForTreeNode(op1); - } - } - else - { - genBitCast(targetType, targetReg, op1->TypeGet(), op1->GetRegNum()); - } - genProduceReg(treeNode); -} - -//-------------------------------------------------------------------------- // -// getBaseVarForPutArgStk - returns the baseVarNum for passing a stack arg. -// -// Arguments -// treeNode - the GT_PUTARG_STK node -// -// Return value: -// The number of the base variable. -// -// Note: -// If tail call the outgoing args are placed in the caller's incoming arg stack space. -// Otherwise, they go in the outgoing arg area on the current frame. -// -// On Windows the caller always creates slots (homing space) in its frame for the -// first 4 arguments of a callee (register passed args). So, the baseVarNum is always 0. -// For System V systems there is no such calling convention requirement, and the code needs to find -// the first stack passed argument from the caller. This is done by iterating over -// all the lvParam variables and finding the first with GetArgReg() equals to REG_STK. -// -//unsigned CodeGen::getBaseVarForPutArgStk(GenTree* treeNode) -//{ -// assert(false); -//} - -//--------------------------------------------------------------------- -// genAlignStackBeforeCall: Align the stack if necessary before a call. -// -// Arguments: -// call - the call node. -// -void CodeGen::genAlignStackBeforeCall(GenTreeCall* call) -{ -#if defined(UNIX_X86_ABI) - - // Have we aligned the stack yet? - if (!call->fgArgInfo->IsStkAlignmentDone()) - { - // We haven't done any stack alignment yet for this call. We might need to create - // an alignment adjustment, even if this function itself doesn't have any stack args. - // This can happen if this function call is part of a nested call sequence, and the outer - // call has already pushed some arguments. - - unsigned stkLevel = genStackLevel + call->fgArgInfo->GetStkSizeBytes(); - call->fgArgInfo->ComputeStackAlignment(stkLevel); - - unsigned padStkAlign = call->fgArgInfo->GetStkAlign(); - if (padStkAlign != 0) - { - // Now generate the alignment - inst_RV_IV(INS_sub, REG_SPBASE, padStkAlign, EA_PTRSIZE); - AddStackLevel(padStkAlign); - AddNestedAlignment(padStkAlign); - } - - call->fgArgInfo->SetStkAlignmentDone(); - } - -#endif // UNIX_X86_ABI -} - -//--------------------------------------------------------------------- -// genRemoveAlignmentAfterCall: After a call, remove the alignment -// added before the call, if any. -// -// Arguments: -// call - the call node. -// bias - additional stack adjustment -// -// Note: -// When bias > 0, caller should adjust stack level appropriately as -// bias is not considered when adjusting stack level. 
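The alignment pair described above (genAlignStackBeforeCall / genRemoveAlignmentAfterCall) pads ESP so the call site is 16-byte aligned and pops the same padding afterwards. A sketch of just the padding arithmetic (alignmentPadding is an invented helper; the real code tracks the level through fgArgInfo):

#include <cassert>

// Illustrative only: given the bytes pushed since the last aligned point plus
// the bytes the outgoing args will push, pad so the total is a multiple of 16,
// then remove the same padding after the call returns.
unsigned alignmentPadding(unsigned stkLevelBytes, unsigned alignment = 16)
{
    unsigned rem = stkLevelBytes % alignment;
    return (rem == 0) ? 0 : (alignment - rem);
}

int main()
{
    assert(alignmentPadding(0)  == 0);
    assert(alignmentPadding(4)  == 12); // e.g. one pushed 4-byte argument
    assert(alignmentPadding(20) == 12);
    assert(alignmentPadding(32) == 0);
    return 0;
}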
-// -void CodeGen::genRemoveAlignmentAfterCall(GenTreeCall* call, unsigned bias) -{ -#if defined(TARGET_X86) -#if defined(UNIX_X86_ABI) - // Put back the stack pointer if there was any padding for stack alignment - unsigned padStkAlign = call->fgArgInfo->GetStkAlign(); - unsigned padStkAdjust = padStkAlign + bias; - - if (padStkAdjust != 0) - { - inst_RV_IV(INS_add, REG_SPBASE, padStkAdjust, EA_PTRSIZE); - SubtractStackLevel(padStkAlign); - SubtractNestedAlignment(padStkAlign); - } -#else // UNIX_X86_ABI - if (bias != 0) - { - genAdjustSP(bias); - } -#endif // !UNIX_X86_ABI_ -#else // TARGET_X86 - assert(bias == 0); -#endif // !TARGET_X86 -} - -#ifdef TARGET_X86 - -//--------------------------------------------------------------------- -// genAdjustStackForPutArgStk: -// adjust the stack pointer for a putArgStk node if necessary. -// -// Arguments: -// putArgStk - the putArgStk node. -// -// Returns: true if the stack pointer was adjusted; false otherwise. -// -// Notes: -// Sets `m_pushStkArg` to true if the stack arg needs to be pushed, -// false if the stack arg needs to be stored at the current stack -// pointer address. This is exactly the opposite of the return value -// of this function. -// -bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk) -{ - const unsigned argSize = putArgStk->GetStackByteSize(); - GenTree* source = putArgStk->gtGetOp1(); - -#ifdef FEATURE_SIMD - if (!source->OperIs(GT_FIELD_LIST) && varTypeIsSIMD(source)) - { - inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE); - AddStackLevel(argSize); - m_pushStkArg = false; - return true; - } -#endif // FEATURE_SIMD - - // If the gtPutArgStkKind is one of the push types, we do not pre-adjust the stack. - // This is set in Lowering, and is true if and only if: - // - This argument contains any GC pointers OR - // - It is a GT_FIELD_LIST OR - // - It is less than 16 bytes in size. - CLANG_FORMAT_COMMENT_ANCHOR; - -#ifdef DEBUG - switch (putArgStk->gtPutArgStkKind) - { - case GenTreePutArgStk::Kind::RepInstr: - case GenTreePutArgStk::Kind::Unroll: - assert(!source->AsObj()->GetLayout()->HasGCPtr() && (argSize >= 16)); - break; - case GenTreePutArgStk::Kind::Push: - case GenTreePutArgStk::Kind::PushAllSlots: - assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() || (argSize < 16)); - break; - case GenTreePutArgStk::Kind::Invalid: - default: - assert(!"Uninitialized GenTreePutArgStk::Kind"); - break; - } -#endif // DEBUG - - if (putArgStk->isPushKind()) - { - m_pushStkArg = true; - return false; - } - else - { - m_pushStkArg = false; - - // If argSize is large, we need to probe the stack like we do in the prolog (genAllocLclFrame) - // or for localloc (genLclHeap), to ensure we touch the stack pages sequentially, and don't miss - // the stack guard pages. The prolog probes, but we don't know at this point how much higher - // the last probed stack pointer value is. We default a threshold. Any size below this threshold - // we are guaranteed the stack has been probed. Above this threshold, we don't know. The threshold - // should be high enough to cover all common cases. Increasing the threshold means adding a few - // more "lowest address of stack" probes in the prolog. Since this is relatively rare, add it to - // stress modes. 
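The probe-threshold comment above is about touching intervening stack pages in address order when a single adjustment exceeds a page, so the OS guard page is always hit. A rough sketch of that idea using a heap-resident buffer as a stand-in for the stack (probePages and the 4 KB page size are assumptions for illustration):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Illustrative only: when the stack pointer moves down by more than a page in
// one step, each intervening page must be touched ("probed") so the guard page
// is hit in order and the stack can grow.
void probePages(uint8_t* stackTop, size_t adjustment, size_t pageSize = 4096)
{
    for (size_t probed = pageSize; probed <= adjustment; probed += pageSize)
    {
        volatile uint8_t touch = *(stackTop - probed); // the read touches the page
        (void)touch;
    }
}

int main()
{
    static uint8_t fakeStack[64 * 1024]; // stand-in for the real stack region
    probePages(fakeStack + sizeof(fakeStack), 20 * 1024);
    std::puts("probed");
    return 0;
}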
- - if ((argSize >= ARG_STACK_PROBE_THRESHOLD_BYTES) || - compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 5)) - { - genStackPointerConstantAdjustmentLoopWithProbe(-(ssize_t)argSize, REG_NA); - } - else - { - inst_RV_IV(INS_sub, REG_SPBASE, argSize, EA_PTRSIZE); - } - - AddStackLevel(argSize); - return true; - } -} - -//--------------------------------------------------------------------- -// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack. -// -// Arguments -// treeNode - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST -// -// Return value: -// None -// -void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk) -{ - GenTreeFieldList* const fieldList = putArgStk->gtOp1->AsFieldList(); - assert(fieldList != nullptr); - - // Set m_pushStkArg and pre-adjust the stack if necessary. - const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk); - - // For now, we only support the "push" case; we will push a full slot for the first field of each slot - // within the struct. - assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg); - - // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0. - // (Note that this mode is not currently being used.) - // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them - // in reverse order, so we start with the current field offset at the size of the struct arg (which must be - // a multiple of the target pointer size). - unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->GetStackByteSize(); - unsigned prevFieldOffset = currentOffset; - regNumber intTmpReg = REG_NA; - regNumber simdTmpReg = REG_NA; - if (putArgStk->AvailableTempRegCount() != 0) - { - regMaskTP rsvdRegs = putArgStk->gtRsvdRegs; - if ((rsvdRegs & RBM_ALLINT) != 0) - { - intTmpReg = putArgStk->GetSingleTempReg(RBM_ALLINT); - assert(genIsValidIntReg(intTmpReg)); - } - if ((rsvdRegs & RBM_ALLFLOAT) != 0) - { - simdTmpReg = putArgStk->GetSingleTempReg(RBM_ALLFLOAT); - assert(genIsValidFloatReg(simdTmpReg)); - } - assert(genCountBits(rsvdRegs) == (unsigned)((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1)); - } - - for (GenTreeFieldList::Use& use : fieldList->Uses()) - { - GenTree* const fieldNode = use.GetNode(); - const unsigned fieldOffset = use.GetOffset(); - var_types fieldType = use.GetType(); - - // Long-typed nodes should have been handled by the decomposition pass, and lowering should have sorted the - // field list in descending order by offset. - assert(!varTypeIsLong(fieldType)); - assert(fieldOffset <= prevFieldOffset); - - // Consume the register, if any, for this field. Note that genConsumeRegs() will appropriately - // update the liveness info for a lclVar that has been marked RegOptional, which hasn't been - // assigned a register, and which is therefore contained. - // Unlike genConsumeReg(), it handles the case where no registers are being consumed. - genConsumeRegs(fieldNode); - regNumber argReg = fieldNode->isUsedFromSpillTemp() ? REG_NA : fieldNode->GetRegNum(); - - // If the field is slot-like, we can use a push instruction to store the entire register no matter the type. - // - // The GC encoder requires that the stack remain 4-byte aligned at all times. Round the adjustment up - // to the next multiple of 4. If we are going to generate a `push` instruction, the adjustment must - // not require rounding. 
- // NOTE: if the field is of GC type, we must use a push instruction, since the emitter is not otherwise - // able to detect stores into the outgoing argument area of the stack on x86. - const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4); - int adjustment = roundUp(currentOffset - fieldOffset, 4); - if (fieldIsSlot && !varTypeIsSIMD(fieldType)) - { - fieldType = genActualType(fieldType); - unsigned pushSize = genTypeSize(fieldType); - assert((pushSize % 4) == 0); - adjustment -= pushSize; - while (adjustment != 0) - { - inst_IV(INS_push, 0); - currentOffset -= pushSize; - AddStackLevel(pushSize); - adjustment -= pushSize; - } - m_pushStkArg = true; - } - else - { - m_pushStkArg = false; - - // We always "push" floating point fields (i.e. they are full slot values that don't - // require special handling). - assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode)); - - // If we can't push this field, it needs to be in a register so that we can store - // it to the stack location. - if (adjustment != 0) - { - // This moves the stack pointer to fieldOffset. - // For this case, we must adjust the stack and generate stack-relative stores rather than pushes. - // Adjust the stack pointer to the next slot boundary. - inst_RV_IV(INS_sub, REG_SPBASE, adjustment, EA_PTRSIZE); - currentOffset -= adjustment; - AddStackLevel(adjustment); - } - - // Does it need to be in a byte register? - // If so, we'll use intTmpReg, which must have been allocated as a byte register. - // If it's already in a register, but not a byteable one, then move it. - if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0))) - { - assert(intTmpReg != REG_NA); - noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0); - if (argReg != REG_NA) - { - inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType); - argReg = intTmpReg; - } - } - } - - if (argReg == REG_NA) - { - if (m_pushStkArg) - { - if (fieldNode->isUsedFromSpillTemp()) - { - assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD? - assert(fieldNode->IsRegOptional()); - TempDsc* tmp = getSpillTempDsc(fieldNode); - GetEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0); - regSet.tmpRlsTemp(tmp); - } - else - { - assert(varTypeIsIntegralOrI(fieldNode)); - switch (fieldNode->OperGet()) - { - case GT_LCL_VAR: - inst_TT(INS_push, fieldNode, 0, 0, emitActualTypeSize(fieldNode->TypeGet())); - break; - case GT_CNS_INT: - if (fieldNode->IsIconHandle()) - { - inst_IV_handle(INS_push, fieldNode->AsIntCon()->gtIconVal); - } - else - { - inst_IV(INS_push, fieldNode->AsIntCon()->gtIconVal); - } - break; - default: - unreached(); - } - } - currentOffset -= TARGET_POINTER_SIZE; - AddStackLevel(TARGET_POINTER_SIZE); - } - else - { - // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack. 
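The push-based field-list handling above keeps a currentOffset that starts at the struct size and walks fields in descending offset order, padding down to each field before pushing it so every field lands at its struct offset from the final stack top. A simplified sketch of that bookkeeping (Field and totalBytesPushed are invented; the real code rounds adjustments to 4 bytes and mixes push with sub esp):

#include <cassert>
#include <vector>

// Illustrative only: the currentOffset / adjustment arithmetic from the deleted
// genPutArgStkFieldList. Fields arrive sorted by descending offset; padding is
// emitted for any gap above the field before the field itself is pushed.
struct Field { unsigned offset; unsigned size; };

unsigned totalBytesPushed(const std::vector<Field>& fieldsDescending, unsigned structSize)
{
    unsigned currentOffset = structSize;
    unsigned pushed = 0;
    for (const Field& f : fieldsDescending)
    {
        unsigned padding = currentOffset - (f.offset + f.size); // gap above this field
        pushed += padding;                                      // push 0 / sub esp
        pushed += f.size;                                       // the field itself
        currentOffset = f.offset;
    }
    pushed += currentOffset; // leading padding (explicit layout), if any
    return pushed;
}

int main()
{
    // 12-byte struct: ints at offsets 8, 4, 0, pushed in descending order.
    assert(totalBytesPushed({{8, 4}, {4, 4}, {0, 4}}, 12) == 12);
    // 16-byte struct with a hole: ints at offsets 12 and 0.
    assert(totalBytesPushed({{12, 4}, {0, 4}}, 16) == 16);
    return 0;
}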
- assert(varTypeIsIntegralOrI(fieldNode)); - switch (fieldNode->OperGet()) - { - case GT_LCL_VAR: - inst_RV_TT(INS_mov, intTmpReg, fieldNode); - break; - case GT_CNS_INT: - genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode); - break; - default: - unreached(); - } - genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset); - } - } - else - { -#if defined(FEATURE_SIMD) - if (fieldType == TYP_SIMD12) - { - assert(genIsValidFloatReg(simdTmpReg)); - genStoreSIMD12ToStack(argReg, simdTmpReg); - } - else -#endif // defined(FEATURE_SIMD) - { - genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset); - } - if (m_pushStkArg) - { - // We always push a slot-rounded size - currentOffset -= genTypeSize(fieldType); - } - } - - prevFieldOffset = fieldOffset; - } - if (currentOffset != 0) - { - // We don't expect padding at the beginning of a struct, but it could happen with explicit layout. - inst_RV_IV(INS_sub, REG_SPBASE, currentOffset, EA_PTRSIZE); - AddStackLevel(currentOffset); - } -} -#endif // TARGET_X86 - -//--------------------------------------------------------------------- -// genPutArgStk - generate code for passing an arg on the stack. -// -// Arguments -// treeNode - the GT_PUTARG_STK node -// targetType - the type of the treeNode -// -// Return value: -// None -// -void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk) -{ - assert(false); -} - -//--------------------------------------------------------------------- -// genPutArgReg - generate code for a GT_PUTARG_REG node -// -// Arguments -// tree - the GT_PUTARG_REG node -// -// Return value: -// None -// -void CodeGen::genPutArgReg(GenTreeOp* tree) -{ - assert(tree->OperIs(GT_PUTARG_REG)); - - var_types targetType = tree->TypeGet(); - regNumber targetReg = tree->GetRegNum(); - -#ifndef UNIX_AMD64_ABI - assert(targetType != TYP_STRUCT); -#endif // !UNIX_AMD64_ABI - - GenTree* op1 = tree->gtOp1; - genConsumeReg(op1); - - // If child node is not already in the register we need, move it - if (targetReg != op1->GetRegNum()) - { - inst_RV_RV(ins_Copy(targetType), targetReg, op1->GetRegNum(), targetType); - } - - genProduceReg(tree); -} - -#ifdef TARGET_X86 -// genPushReg: Push a register value onto the stack and adjust the stack level -// -// Arguments: -// type - the type of value to be stored -// reg - the register containing the value -// -// Notes: -// For TYP_LONG, the srcReg must be a floating point register. -// Otherwise, the register type must be consistent with the given type. -// -void CodeGen::genPushReg(var_types type, regNumber srcReg) -{ - unsigned size = genTypeSize(type); - if (varTypeIsIntegralOrI(type) && type != TYP_LONG) - { - assert(genIsValidIntReg(srcReg)); - inst_RV(INS_push, srcReg, type); - } - else - { - instruction ins; - emitAttr attr = emitTypeSize(type); - if (type == TYP_LONG) - { - // On x86, the only way we can push a TYP_LONG from a register is if it is in an xmm reg. - // This is only used when we are pushing a struct from memory to memory, and basically is - // handling an 8-byte "chunk", as opposed to strictly a long type. 
- ins = INS_movq; - } - else - { - ins = ins_Store(type); - } - assert(genIsValidFloatReg(srcReg)); - inst_RV_IV(INS_sub, REG_SPBASE, size, EA_PTRSIZE); - GetEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, 0); - } - AddStackLevel(size); -} -#endif // TARGET_X86 - -#if defined(FEATURE_PUT_STRUCT_ARG_STK) -// genStoreRegToStackArg: Store a register value into the stack argument area -// -// Arguments: -// type - the type of value to be stored -// reg - the register containing the value -// offset - the offset from the base (see Assumptions below) -// -// Notes: -// A type of TYP_STRUCT instructs this method to store a 16-byte chunk -// at the given offset (i.e. not the full struct). -// -// Assumptions: -// The caller must set the context appropriately before calling this method: -// - On x64, m_stkArgVarNum must be set according to whether this is a regular or tail call. -// - On x86, the caller must set m_pushStkArg if this method should push the argument. -// Otherwise, the argument is stored at the given offset from sp. -// -// TODO: In the below code the load and store instructions are for 16 bytes, but the -// type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but -// this probably needs to be changed. -// -void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset) -{ - assert(srcReg != REG_NA); - instruction ins; - emitAttr attr; - unsigned size; - - if (type == TYP_STRUCT) - { - ins = INS_movdqu; - // This should be changed! - attr = EA_8BYTE; - size = 16; - } - else - { -#ifdef FEATURE_SIMD - if (varTypeIsSIMD(type)) - { - assert(genIsValidFloatReg(srcReg)); - ins = ins_Store(type); // TODO-CQ: pass 'aligned' correctly - } - else -#endif // FEATURE_SIMD -#ifdef TARGET_X86 - if (type == TYP_LONG) - { - assert(genIsValidFloatReg(srcReg)); - ins = INS_movq; - } - else -#endif // TARGET_X86 - { - assert((varTypeUsesFloatReg(type) && genIsValidFloatReg(srcReg)) || - (varTypeIsIntegralOrI(type) && genIsValidIntReg(srcReg))); - ins = ins_Store(type); - } - attr = emitTypeSize(type); - size = genTypeSize(type); - } - -#ifdef TARGET_X86 - if (m_pushStkArg) - { - genPushReg(type, srcReg); - } - else - { - GetEmitter()->emitIns_AR_R(ins, attr, srcReg, REG_SPBASE, offset); - } -#else // !TARGET_X86 - assert(m_stkArgVarNum != BAD_VAR_NUM); - GetEmitter()->emitIns_S_R(ins, attr, srcReg, m_stkArgVarNum, m_stkArgOffset + offset); -#endif // !TARGET_X86 -} - -//--------------------------------------------------------------------- -// genPutStructArgStk - generate code for copying a struct arg on the stack by value. -// In case there are references to heap object in the struct, -// it generates the gcinfo as well. -// -// Arguments -// putArgStk - the GT_PUTARG_STK node -// -// Notes: -// In the case of fixed out args, the caller must have set m_stkArgVarNum to the variable number -// corresponding to the argument area (where we will put the argument on the stack). -// For tail calls this is the baseVarNum = 0. -// For non tail calls this is the outgoingArgSpace. -void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk) -{ - assert(false); -} -#endif // defined(FEATURE_PUT_STRUCT_ARG_STK) - -/***************************************************************************** - * - * Create and record GC Info for the function. 
- */ -#ifndef JIT32_GCENCODER -void -#else // !JIT32_GCENCODER -void* -#endif // !JIT32_GCENCODER -CodeGen::genCreateAndStoreGCInfo(unsigned codeSize, unsigned prologSize, unsigned epilogSize DEBUGARG(void* codePtr)) -{ -#ifdef JIT32_GCENCODER - return genCreateAndStoreGCInfoJIT32(codeSize, prologSize, epilogSize DEBUGARG(codePtr)); -#else // !JIT32_GCENCODER - genCreateAndStoreGCInfoX64(codeSize, prologSize DEBUGARG(codePtr)); -#endif // !JIT32_GCENCODER -} - -#ifdef JIT32_GCENCODER -void* CodeGen::genCreateAndStoreGCInfoJIT32(unsigned codeSize, - unsigned prologSize, - unsigned epilogSize DEBUGARG(void* codePtr)) -{ - BYTE headerBuf[64]; - InfoHdr header; - - int s_cached; - -#ifdef FEATURE_EH_FUNCLETS - // We should do this before gcInfoBlockHdrSave since varPtrTableSize must be finalized before it - if (compiler->ehAnyFunclets()) - { - gcInfo.gcMarkFilterVarsPinned(); - } -#endif - -#ifdef DEBUG - size_t headerSize = -#endif - compiler->compInfoBlkSize = - gcInfo.gcInfoBlockHdrSave(headerBuf, 0, codeSize, prologSize, epilogSize, &header, &s_cached); - - size_t argTabOffset = 0; - size_t ptrMapSize = gcInfo.gcPtrTableSize(header, codeSize, &argTabOffset); - -#if DISPLAY_SIZES - - if (GetInterruptible()) - { - gcHeaderISize += compiler->compInfoBlkSize; - gcPtrMapISize += ptrMapSize; - } - else - { - gcHeaderNSize += compiler->compInfoBlkSize; - gcPtrMapNSize += ptrMapSize; - } - -#endif // DISPLAY_SIZES - - compiler->compInfoBlkSize += ptrMapSize; - - /* Allocate the info block for the method */ - - compiler->compInfoBlkAddr = (BYTE*)compiler->info.compCompHnd->allocGCInfo(compiler->compInfoBlkSize); - -#if 0 // VERBOSE_SIZES - // TODO-X86-Cleanup: 'dataSize', below, is not defined - -// if (compiler->compInfoBlkSize > codeSize && compiler->compInfoBlkSize > 100) - { - printf("[%7u VM, %7u+%7u/%7u x86 %03u/%03u%%] %s.%s\n", - compiler->info.compILCodeSize, - compiler->compInfoBlkSize, - codeSize + dataSize, - codeSize + dataSize - prologSize - epilogSize, - 100 * (codeSize + dataSize) / compiler->info.compILCodeSize, - 100 * (codeSize + dataSize + compiler->compInfoBlkSize) / compiler->info.compILCodeSize, - compiler->info.compClassName, - compiler->info.compMethodName); -} - -#endif - - /* Fill in the info block and return it to the caller */ - - void* infoPtr = compiler->compInfoBlkAddr; - - /* Create the method info block: header followed by GC tracking tables */ - - compiler->compInfoBlkAddr += - gcInfo.gcInfoBlockHdrSave(compiler->compInfoBlkAddr, -1, codeSize, prologSize, epilogSize, &header, &s_cached); - - assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize); - compiler->compInfoBlkAddr = gcInfo.gcPtrTableSave(compiler->compInfoBlkAddr, header, codeSize, &argTabOffset); - assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + headerSize + ptrMapSize); - -#ifdef DEBUG - - if (0) - { - BYTE* temp = (BYTE*)infoPtr; - size_t size = compiler->compInfoBlkAddr - temp; - BYTE* ptab = temp + headerSize; - - noway_assert(size == headerSize + ptrMapSize); - - printf("Method info block - header [%zu bytes]:", headerSize); - - for (unsigned i = 0; i < size; i++) - { - if (temp == ptab) - { - printf("\nMethod info block - ptrtab [%u bytes]:", ptrMapSize); - printf("\n %04X: %*c", i & ~0xF, 3 * (i & 0xF), ' '); - } - else - { - if (!(i % 16)) - printf("\n %04X: ", i); - } - - printf("%02X ", *temp++); - } - - printf("\n"); - } - -#endif // DEBUG - -#if DUMP_GC_TABLES - - if (compiler->opts.dspGCtbls) - { - const BYTE* base = (BYTE*)infoPtr; - size_t size; - unsigned methodSize; - 
InfoHdr dumpHeader; - - printf("GC Info for method %s\n", compiler->info.compFullName); - printf("GC info size = %3u\n", compiler->compInfoBlkSize); - - size = gcInfo.gcInfoBlockHdrDump(base, &dumpHeader, &methodSize); - // printf("size of header encoding is %3u\n", size); - printf("\n"); - - if (compiler->opts.dspGCtbls) - { - base += size; - size = gcInfo.gcDumpPtrTable(base, dumpHeader, methodSize); - // printf("size of pointer table is %3u\n", size); - printf("\n"); - noway_assert(compiler->compInfoBlkAddr == (base + size)); - } - } - -#endif // DUMP_GC_TABLES - - /* Make sure we ended up generating the expected number of bytes */ - - noway_assert(compiler->compInfoBlkAddr == (BYTE*)infoPtr + compiler->compInfoBlkSize); - - return infoPtr; -} - -#else // !JIT32_GCENCODER -void CodeGen::genCreateAndStoreGCInfoX64(unsigned codeSize, unsigned prologSize DEBUGARG(void* codePtr)) -{ - IAllocator* allowZeroAlloc = new (compiler, CMK_GC) CompIAllocator(compiler->getAllocatorGC()); - GcInfoEncoder* gcInfoEncoder = new (compiler, CMK_GC) - GcInfoEncoder(compiler->info.compCompHnd, compiler->info.compMethodInfo, allowZeroAlloc, NOMEM); - assert(gcInfoEncoder); - - // Follow the code pattern of the x86 gc info encoder (genCreateAndStoreGCInfoJIT32). - gcInfo.gcInfoBlockHdrSave(gcInfoEncoder, codeSize, prologSize); - - // We keep the call count for the second call to gcMakeRegPtrTable() below. - unsigned callCnt = 0; - // First we figure out the encoder ID's for the stack slots and registers. - gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_ASSIGN_SLOTS, &callCnt); - // Now we've requested all the slots we'll need; "finalize" these (make more compact data structures for them). - gcInfoEncoder->FinalizeSlotIds(); - // Now we can actually use those slot ID's to declare live ranges. 
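The deleted X64 GC-info path runs gcMakeRegPtrTable twice around FinalizeSlotIds: the first pass only registers slots so the encoder can assign compact IDs, the second reports live ranges against the finalized IDs. A sketch of that assign-then-report pattern with a hypothetical encoder (TinyEncoder is not the real GcInfoEncoder API):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Illustrative only: pass 1 registers every slot so the encoder can build a
// compact ID space; after finalization, pass 2 reports live ranges against the
// now-stable IDs.
class TinyEncoder
{
    std::map<std::string, int> ids_;
    bool finalized_ = false;
public:
    int getSlotId(const std::string& slot)
    {
        auto it = ids_.find(slot);
        if (it != ids_.end()) return it->second;
        // New IDs may only be handed out before finalization.
        if (finalized_) { std::fprintf(stderr, "slot requested after finalize\n"); return -1; }
        int id = static_cast<int>(ids_.size());
        ids_[slot] = id;
        return id;
    }
    void finalize() { finalized_ = true; }
    void reportLiveRange(int id, int start, int end)
    {
        std::printf("slot %d live [%d, %d)\n", id, start, end);
    }
};

int main()
{
    std::vector<std::string> slots = {"rsi", "stack+8", "rbx"};
    TinyEncoder enc;
    for (const auto& s : slots) enc.getSlotId(s);  // pass 1: assign IDs
    enc.finalize();                                // FinalizeSlotIds analogue
    for (const auto& s : slots)                    // pass 2: report live ranges
        enc.reportLiveRange(enc.getSlotId(s), 0, 16);
    return 0;
}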
- gcInfo.gcMakeRegPtrTable(gcInfoEncoder, codeSize, prologSize, GCInfo::MAKE_REG_PTR_MODE_DO_WORK, &callCnt); - - if (compiler->opts.compDbgEnC) - { - // what we have to preserve is called the "frame header" (see comments in VM\eetwain.cpp) - // which is: - // -return address - // -saved off RBP - // -saved 'this' pointer and bool for synchronized methods - - // 4 slots for RBP + return address + RSI + RDI - int preservedAreaSize = 4 * REGSIZE_BYTES; - - if (compiler->info.compFlags & CORINFO_FLG_SYNCH) - { - if (!(compiler->info.compFlags & CORINFO_FLG_STATIC)) - { - preservedAreaSize += REGSIZE_BYTES; - } - - // bool in synchronized methods that tracks whether the lock has been taken (takes 4 bytes on stack) - preservedAreaSize += 4; - } - - // Used to signal both that the method is compiled for EnC, and also the size of the block at the top of the - // frame - gcInfoEncoder->SetSizeOfEditAndContinuePreservedArea(preservedAreaSize); - } - - if (compiler->opts.IsReversePInvoke()) - { - unsigned reversePInvokeFrameVarNumber = compiler->lvaReversePInvokeFrameVar; - assert(reversePInvokeFrameVarNumber != BAD_VAR_NUM && reversePInvokeFrameVarNumber < compiler->lvaRefCount); - LclVarDsc& reversePInvokeFrameVar = compiler->lvaTable[reversePInvokeFrameVarNumber]; - gcInfoEncoder->SetReversePInvokeFrameSlot(reversePInvokeFrameVar.GetStackOffset()); - } - - gcInfoEncoder->Build(); - - // GC Encoder automatically puts the GC info in the right spot using ICorJitInfo::allocGCInfo(size_t) - // let's save the values anyway for debugging purposes - compiler->compInfoBlkAddr = gcInfoEncoder->Emit(); - compiler->compInfoBlkSize = 0; // not exposed by the GCEncoder interface -} -#endif // !JIT32_GCENCODER - -/***************************************************************************** - * Emit a call to a helper function. - * - */ - -void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regNumber callTargetReg) -{ - void* addr = nullptr; - void* pAddr = nullptr; - - emitter::EmitCallType callType = emitter::EC_FUNC_TOKEN; - addr = compiler->compGetHelperFtn((CorInfoHelpFunc)helper, &pAddr); - regNumber callTarget = REG_NA; - regMaskTP killMask = compiler->compHelperCallKillSet((CorInfoHelpFunc)helper); - - if (!addr) - { - assert(pAddr != nullptr); - - // Absolute indirect call addr - // Note: Order of checks is important. First always check for pc-relative and next - // zero-relative. Because the former encoding is 1-byte smaller than the latter. - if (genCodeIndirAddrCanBeEncodedAsPCRelOffset((size_t)pAddr) || - genCodeIndirAddrCanBeEncodedAsZeroRelOffset((size_t)pAddr)) - { - // generate call whose target is specified by 32-bit offset relative to PC or zero. - callType = emitter::EC_FUNC_TOKEN_INDIR; - addr = pAddr; - } - else - { -#ifdef TARGET_AMD64 - // If this indirect address cannot be encoded as 32-bit offset relative to PC or Zero, - // load it into REG_HELPER_CALL_TARGET and use register indirect addressing mode to - // make the call. - // mov reg, addr - // call [reg] - - if (callTargetReg == REG_NA) - { - // If a callTargetReg has not been explicitly provided, we will use REG_DEFAULT_HELPER_CALL_TARGET, but - // this is only a valid assumption if the helper call is known to kill REG_DEFAULT_HELPER_CALL_TARGET. 
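The target-selection logic above prefers a 32-bit displacement form of the indirect call when the indirection cell is reachable either RIP-relative or zero-relative, and only falls back to loading the address into a register otherwise. A sketch of the encodability checks (fitsInInt32 and canUse32BitDisp are invented stand-ins for the genCodeIndirAddrCanBeEncoded* helpers):

#include <cstdint>
#include <cassert>

// Illustrative only: an absolute indirection cell can be reached with a 32-bit
// displacement if either (addr - end of instruction) fits in int32 (pc-relative)
// or the address itself fits in int32 (zero-relative); otherwise it must be
// loaded into a register and called indirectly.
bool fitsInInt32(int64_t v)
{
    return v == static_cast<int32_t>(v);
}

bool canUse32BitDisp(uint64_t addr, uint64_t nextInstrAddr)
{
    int64_t pcRel = static_cast<int64_t>(addr) - static_cast<int64_t>(nextInstrAddr);
    return fitsInInt32(pcRel)                        // pc-relative, checked first
        || fitsInInt32(static_cast<int64_t>(addr));  // zero-relative
}

int main()
{
    assert(canUse32BitDisp(0x00007ffe00001000ull, 0x00007ffe00400000ull));  // near the code
    assert(canUse32BitDisp(0x0000000000401000ull, 0x00007ffe00400000ull));  // low 2 GB
    assert(!canUse32BitDisp(0x0000100000000000ull, 0x00007ffe00400000ull)); // needs a register
    return 0;
}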
- callTargetReg = REG_DEFAULT_HELPER_CALL_TARGET; - regMaskTP callTargetMask = genRegMask(callTargetReg); - noway_assert((callTargetMask & killMask) == callTargetMask); - } - else - { - // The call target must not overwrite any live variable, though it may not be in the - // kill set for the call. - regMaskTP callTargetMask = genRegMask(callTargetReg); - noway_assert((callTargetMask & regSet.GetMaskVars()) == RBM_NONE); - } -#endif - - callTarget = callTargetReg; - CodeGen::genSetRegToIcon(callTarget, (ssize_t)pAddr, TYP_I_IMPL); - callType = emitter::EC_INDIR_ARD; - } - } - - // clang-format off - GetEmitter()->emitIns_Call(callType, - compiler->eeFindHelper(helper), - INDEBUG_LDISASM_COMMA(nullptr) addr, - argSize, - retSize - MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(EA_UNKNOWN), - gcInfo.gcVarPtrSetCur, - gcInfo.gcRegGCrefSetCur, - gcInfo.gcRegByrefSetCur, - BAD_IL_OFFSET, // IL offset - callTarget, // ireg - REG_NA, 0, 0, // xreg, xmul, disp - false // isJump - ); - // clang-format on - - regSet.verifyRegistersUsed(killMask); -} - -/***************************************************************************** -* Unit testing of the XArch emitter: generate a bunch of instructions into the prolog -* (it's as good a place as any), then use COMPlus_JitLateDisasm=* to see if the late -* disassembler thinks the instructions as the same as we do. -*/ - -// Uncomment "#define ALL_ARM64_EMITTER_UNIT_TESTS" to run all the unit tests here. -// After adding a unit test, and verifying it works, put it under this #ifdef, so we don't see it run every time. -//#define ALL_XARCH_EMITTER_UNIT_TESTS - -#if defined(DEBUG) && defined(LATE_DISASM) && defined(TARGET_AMD64) -void CodeGen::genAmd64EmitterUnitTests() -{ - if (!verbose) - { - return; - } - - if (!compiler->opts.altJit) - { - // No point doing this in a "real" JIT. - return; - } - - // Mark the "fake" instructions in the output. - printf("*************** In genAmd64EmitterUnitTests()\n"); - - // We use this: - // genDefineTempLabel(genCreateTempLabel()); - // to create artificial labels to help separate groups of tests. 
- - // - // Loads - // - CLANG_FORMAT_COMMENT_ANCHOR; - -#ifdef ALL_XARCH_EMITTER_UNIT_TESTS - genDefineTempLabel(genCreateTempLabel()); - - // vhaddpd ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_haddpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vaddss xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_addss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vaddsd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_addsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vaddps xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_addps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vaddps ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_addps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vaddpd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_addpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vaddpd ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_addpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vsubss xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_subss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vsubsd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_subsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vsubps ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_subps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vsubps ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_subps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vsubpd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_subpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vsubpd ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_subpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vmulss xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_mulss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vmulsd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_mulsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vmulps xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_mulps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vmulpd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_mulpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vmulps ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_mulps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vmulpd ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_mulpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vandps xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_andps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vandpd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_andpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vandps ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_andps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vandpd ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_andpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vorps xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_orps, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vorpd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_orpd, EA_16BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vorps ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_orps, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vorpd ymm0,ymm1,ymm2 - GetEmitter()->emitIns_R_R_R(INS_orpd, EA_32BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vdivss xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vdivsd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vdivss xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_divss, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vdivsd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_divsd, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - - // vdivss 
xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_cvtss2sd, EA_4BYTE, REG_XMM0, REG_XMM1, REG_XMM2); - // vdivsd xmm0,xmm1,xmm2 - GetEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); -#endif // ALL_XARCH_EMITTER_UNIT_TESTS - printf("*************** End of genAmd64EmitterUnitTests()\n"); -} - -#endif // defined(DEBUG) && defined(LATE_DISASM) && defined(TARGET_AMD64) - -#ifdef PROFILING_SUPPORTED - -#ifdef TARGET_X86 - -//----------------------------------------------------------------------------------- -// genProfilingEnterCallback: Generate the profiling function enter callback. -// -// Arguments: -// initReg - register to use as scratch register -// pInitRegZeroed - OUT parameter. This variable remains unchanged. -// -// Return Value: -// None -// -// Notes: -// The x86 profile enter helper has the following requirements (see ProfileEnterNaked in -// VM\i386\asmhelpers.asm for details): -// 1. The calling sequence for calling the helper is: -// push FunctionIDOrClientID -// call ProfileEnterHelper -// 2. The calling function has an EBP frame. -// 3. EBP points to the saved ESP which is the first thing saved in the function. Thus, -// the following prolog is assumed: -// push ESP -// mov EBP, ESP -// 4. All registers are preserved. -// 5. The helper pops the FunctionIDOrClientID argument from the stack. -// -void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) -{ - assert(compiler->compGeneratingProlog); - - // Give profiler a chance to back out of hooking this method - if (!compiler->compIsProfilerHookNeeded()) - { - return; - } - - unsigned saveStackLvl2 = genStackLevel; - -// Important note: when you change enter probe layout, you must also update SKIP_ENTER_PROF_CALLBACK() -// for x86 stack unwinding - -#if defined(UNIX_X86_ABI) - // Manually align the stack to be 16-byte aligned. This is similar to CodeGen::genAlignStackBeforeCall() - GetEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, REG_SPBASE, 0xC); -#endif // UNIX_X86_ABI - - // Push the profilerHandle - if (compiler->compProfilerMethHndIndirected) - { - GetEmitter()->emitIns_AR_R(INS_push, EA_PTR_DSP_RELOC, REG_NA, REG_NA, (ssize_t)compiler->compProfilerMethHnd); - } - else - { - inst_IV(INS_push, (size_t)compiler->compProfilerMethHnd); - } - - // This will emit either - // "call ip-relative 32-bit offset" or - // "mov rax, helper addr; call rax" - genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, - 0, // argSize. Again, we have to lie about it - EA_UNKNOWN); // retSize - - // Check that we have place for the push. - assert(compiler->fgGetPtrArgCntMax() >= 1); - -#if defined(UNIX_X86_ABI) - // Restoring alignment manually. This is similar to CodeGen::genRemoveAlignmentAfterCall - GetEmitter()->emitIns_R_I(INS_add, EA_4BYTE, REG_SPBASE, 0x10); -#endif // UNIX_X86_ABI - - /* Restore the stack level */ - - SetStackLevel(saveStackLvl2); -} - -//----------------------------------------------------------------------------------- -// genProfilingLeaveCallback: Generate the profiling function leave or tailcall callback. -// Technically, this is not part of the epilog; it is called when we are generating code for a GT_RETURN node. -// -// Arguments: -// helper - which helper to call. Either CORINFO_HELP_PROF_FCN_LEAVE or CORINFO_HELP_PROF_FCN_TAILCALL -// -// Return Value: -// None -// -// Notes: -// The x86 profile leave/tailcall helper has the following requirements (see ProfileLeaveNaked and -// ProfileTailcallNaked in VM\i386\asmhelpers.asm for details): -// 1. 
The calling sequence for calling the helper is: -// push FunctionIDOrClientID -// call ProfileLeaveHelper or ProfileTailcallHelper -// 2. The calling function has an EBP frame. -// 3. EBP points to the saved ESP which is the first thing saved in the function. Thus, -// the following prolog is assumed: -// push ESP -// mov EBP, ESP -// 4. helper == CORINFO_HELP_PROF_FCN_LEAVE: All registers are preserved. -// helper == CORINFO_HELP_PROF_FCN_TAILCALL: Only argument registers are preserved. -// 5. The helper pops the FunctionIDOrClientID argument from the stack. -// -void CodeGen::genProfilingLeaveCallback(unsigned helper) -{ - assert((helper == CORINFO_HELP_PROF_FCN_LEAVE) || (helper == CORINFO_HELP_PROF_FCN_TAILCALL)); - - // Only hook if profiler says it's okay. - if (!compiler->compIsProfilerHookNeeded()) - { - return; - } - - compiler->info.compProfilerCallback = true; - - // Need to save on to the stack level, since the helper call will pop the argument - unsigned saveStackLvl2 = genStackLevel; - -#if defined(UNIX_X86_ABI) - // Manually align the stack to be 16-byte aligned. This is similar to CodeGen::genAlignStackBeforeCall() - GetEmitter()->emitIns_R_I(INS_sub, EA_4BYTE, REG_SPBASE, 0xC); - AddStackLevel(0xC); - AddNestedAlignment(0xC); -#endif // UNIX_X86_ABI - - // - // Push the profilerHandle - // - - if (compiler->compProfilerMethHndIndirected) - { - GetEmitter()->emitIns_AR_R(INS_push, EA_PTR_DSP_RELOC, REG_NA, REG_NA, (ssize_t)compiler->compProfilerMethHnd); - } - else - { - inst_IV(INS_push, (size_t)compiler->compProfilerMethHnd); - } - genSinglePush(); - -#if defined(UNIX_X86_ABI) - int argSize = -REGSIZE_BYTES; // negative means caller-pop (cdecl) -#else - int argSize = REGSIZE_BYTES; -#endif - genEmitHelperCall(helper, argSize, EA_UNKNOWN /* retSize */); - - // Check that we have place for the push. - assert(compiler->fgGetPtrArgCntMax() >= 1); - -#if defined(UNIX_X86_ABI) - // Restoring alignment manually. This is similar to CodeGen::genRemoveAlignmentAfterCall - GetEmitter()->emitIns_R_I(INS_add, EA_4BYTE, REG_SPBASE, 0x10); - SubtractStackLevel(0x10); - SubtractNestedAlignment(0xC); -#endif // UNIX_X86_ABI - - /* Restore the stack level */ - SetStackLevel(saveStackLvl2); -} - -#endif // TARGET_X86 - - -//----------------------------------------------------------------------------------- -// genProfilingEnterCallback: Generate the profiling function enter callback. -// -// Arguments: -// initReg - register to use as scratch register -// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if -// this call sets 'initReg' to a non-zero value. -// -// Return Value: -// None -// -void CodeGen::genProfilingEnterCallback(regNumber initReg, bool* pInitRegZeroed) -{ - assert(compiler->compGeneratingProlog); - - // Give profiler a chance to back out of hooking this method - if (!compiler->compIsProfilerHookNeeded()) - { - return; - } - -#if !defined(UNIX_AMD64_ABI) - - unsigned varNum; - LclVarDsc* varDsc; - - // Since the method needs to make a profiler callback, it should have out-going arg space allocated. - noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); - noway_assert(compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES)); - - // Home all arguments passed in arg registers (RCX, RDX, R8 and R9). - // In case of vararg methods, arg regs are already homed. - // - // Note: Here we don't need to worry about updating gc'info since enter - // callback is generated as part of prolog which is non-gc interruptible. 
- // Moreover GC cannot kick while executing inside profiler callback which is a - // profiler requirement so it can examine arguments which could be obj refs. - if (!compiler->info.compIsVarArgs) - { - for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->info.compArgsCount; varNum++, varDsc++) - { - noway_assert(varDsc->lvIsParam); - - if (!varDsc->lvIsRegArg) - { - continue; - } - - var_types storeType = varDsc->lvaArgType(); - regNumber argReg = varDsc->GetArgReg(); - - instruction store_ins = ins_Store(storeType); - -#ifdef FEATURE_SIMD - if ((storeType == TYP_SIMD8) && genIsValidIntReg(argReg)) - { - store_ins = INS_mov; - } -#endif // FEATURE_SIMD - - GetEmitter()->emitIns_S_R(store_ins, emitTypeSize(storeType), argReg, varNum, 0); - } - } - - // Emit profiler EnterCallback(ProfilerMethHnd, caller's SP) - // RCX = ProfilerMethHnd - if (compiler->compProfilerMethHndIndirected) - { - // Profiler hooks enabled during Ngen time. - // Profiler handle needs to be accessed through an indirection of a pointer. - GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - else - { - // No need to record relocations, if we are generating ELT hooks under the influence - // of COMPlus_JitELTHookEnabled=1 - if (compiler->opts.compJitELTHookEnabled) - { - genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); - } - else - { - instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - } - - // RDX = caller's SP - // Notes - // 1) Here we can query caller's SP offset since prolog will be generated after final frame layout. - // 2) caller's SP relative offset to FramePointer will be negative. We need to add absolute value - // of that offset to FramePointer to obtain caller's SP value. - assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); - int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); - - // This will emit either - // "call ip-relative 32-bit offset" or - // "mov rax, helper addr; call rax" - genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN); - - // TODO-AMD64-CQ: Rather than reloading, see if this could be optimized by combining with prolog - // generation logic that moves args around as required by first BB entry point conditions - // computed by LSRA. Code pointers for investigating this further: genFnPrologCalleeRegArgs() - // and genEnregisterIncomingStackArgs(). - // - // Now reload arg registers from home locations. - // Vararg methods: - // - we need to reload only known (i.e. fixed) reg args. 
- // - if floating point type, also reload it into corresponding integer reg - for (varNum = 0, varDsc = compiler->lvaTable; varNum < compiler->info.compArgsCount; varNum++, varDsc++) - { - noway_assert(varDsc->lvIsParam); - - if (!varDsc->lvIsRegArg) - { - continue; - } - - var_types loadType = varDsc->lvaArgType(); - regNumber argReg = varDsc->GetArgReg(); - - instruction load_ins = ins_Load(loadType); - -#ifdef FEATURE_SIMD - if ((loadType == TYP_SIMD8) && genIsValidIntReg(argReg)) - { - load_ins = INS_mov; - } -#endif // FEATURE_SIMD - - GetEmitter()->emitIns_R_S(load_ins, emitTypeSize(loadType), argReg, varNum, 0); - -#if FEATURE_VARARG - if (compiler->info.compIsVarArgs && varTypeIsFloating(loadType)) - { - regNumber intArgReg = compiler->getCallArgIntRegister(argReg); - instruction ins = ins_CopyFloatToInt(loadType, TYP_LONG); - inst_RV_RV(ins, argReg, intArgReg, loadType); - } -#endif // FEATURE_VARARG - } - - // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. - if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) - { - *pInitRegZeroed = false; - } - -#else // !defined(UNIX_AMD64_ABI) - - // Emit profiler EnterCallback(ProfilerMethHnd, caller's SP) - // R14 = ProfilerMethHnd - if (compiler->compProfilerMethHndIndirected) - { - // Profiler hooks enabled during Ngen time. - // Profiler handle needs to be accessed through an indirection of a pointer. - GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_PROFILER_ENTER_ARG_0, - (ssize_t)compiler->compProfilerMethHnd); - } - else - { - // No need to record relocations, if we are generating ELT hooks under the influence - // of COMPlus_JitELTHookEnabled=1 - if (compiler->opts.compJitELTHookEnabled) - { - genSetRegToIcon(REG_PROFILER_ENTER_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); - } - else - { - instGen_Set_Reg_To_Imm(EA_8BYTE, REG_PROFILER_ENTER_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - } - - // R15 = caller's SP - // Notes - // 1) Here we can query caller's SP offset since prolog will be generated after final frame layout. - // 2) caller's SP relative offset to FramePointer will be negative. We need to add absolute value - // of that offset to FramePointer to obtain caller's SP value. - assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); - int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_PROFILER_ENTER_ARG_1, genFramePointerReg(), -callerSPOffset); - - // We can use any callee trash register (other than RAX, RDI, RSI) for call target. - // We use R11 here. This will emit either - // "call ip-relative 32-bit offset" or - // "mov r11, helper addr; call r11" - genEmitHelperCall(CORINFO_HELP_PROF_FCN_ENTER, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET); - - // If initReg is one of RBM_CALLEE_TRASH, then it needs to be zero'ed before using. - if ((RBM_CALLEE_TRASH & genRegMask(initReg)) != 0) - { - *pInitRegZeroed = false; - } - -#endif // !defined(UNIX_AMD64_ABI) -} - -//----------------------------------------------------------------------------------- -// genProfilingLeaveCallback: Generate the profiling function leave or tailcall callback. -// Technically, this is not part of the epilog; it is called when we are generating code for a GT_RETURN node. -// -// Arguments: -// helper - which helper to call. 
Either CORINFO_HELP_PROF_FCN_LEAVE or CORINFO_HELP_PROF_FCN_TAILCALL -// -// Return Value: -// None -// -void CodeGen::genProfilingLeaveCallback(unsigned helper) -{ - assert((helper == CORINFO_HELP_PROF_FCN_LEAVE) || (helper == CORINFO_HELP_PROF_FCN_TAILCALL)); - - // Only hook if profiler says it's okay. - if (!compiler->compIsProfilerHookNeeded()) - { - return; - } - - compiler->info.compProfilerCallback = true; - -#if !defined(UNIX_AMD64_ABI) - - // Since the method needs to make a profiler callback, it should have out-going arg space allocated. - noway_assert(compiler->lvaOutgoingArgSpaceVar != BAD_VAR_NUM); - noway_assert(compiler->lvaOutgoingArgSpaceSize >= (4 * REGSIZE_BYTES)); - - // If thisPtr needs to be kept alive and reported, it cannot be one of the callee trash - // registers that profiler callback kills. - if (compiler->lvaKeepAliveAndReportThis() && compiler->lvaTable[compiler->info.compThisArg].lvIsInReg()) - { - regMaskTP thisPtrMask = genRegMask(compiler->lvaTable[compiler->info.compThisArg].GetRegNum()); - noway_assert((RBM_PROFILER_LEAVE_TRASH & thisPtrMask) == 0); - } - - // At this point return value is computed and stored in RAX or XMM0. - // On Amd64, Leave callback preserves the return register. We keep - // RAX alive by not reporting as trashed by helper call. Also note - // that GC cannot kick-in while executing inside profiler callback, - // which is a requirement of profiler as well since it needs to examine - // return value which could be an obj ref. - - // RCX = ProfilerMethHnd - if (compiler->compProfilerMethHndIndirected) - { - // Profiler hooks enabled during Ngen time. - // Profiler handle needs to be accessed through an indirection of an address. - GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - else - { - // Don't record relocations, if we are generating ELT hooks under the influence - // of COMPlus_JitELTHookEnabled=1 - if (compiler->opts.compJitELTHookEnabled) - { - genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); - } - else - { - instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - } - - // RDX = caller's SP - // TODO-AMD64-Cleanup: Once we start doing codegen after final frame layout, retain the "if" portion - // of the stmnts to execute unconditionally and clean-up rest. - if (compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) - { - // Caller's SP relative offset to FramePointer will be negative. We need to add absolute - // value of that offset to FramePointer to obtain caller's SP value. - int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); - } - else - { - // If we are here means that it is a tentative frame layout during which we - // cannot use caller's SP offset since it is an estimate. For now we require the - // method to have at least a single arg so that we can use it to obtain caller's - // SP. - LclVarDsc* varDsc = compiler->lvaTable; - NYI_IF((varDsc == nullptr) || !varDsc->lvIsParam, "Profiler ELT callback for a method without any params"); - - // lea rdx, [FramePointer + Arg0's offset] - GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, REG_ARG_1, 0, 0); - } - - // We can use any callee trash register (other than RAX, RCX, RDX) for call target. - // We use R8 here. 
This will emit either - // "call ip-relative 32-bit offset" or - // "mov r8, helper addr; call r8" - genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_ARG_2); - -#else // !defined(UNIX_AMD64_ABI) - - // RDI = ProfilerMethHnd - if (compiler->compProfilerMethHndIndirected) - { - GetEmitter()->emitIns_R_AI(INS_mov, EA_PTR_DSP_RELOC, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - else - { - if (compiler->opts.compJitELTHookEnabled) - { - genSetRegToIcon(REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd, TYP_I_IMPL); - } - else - { - instGen_Set_Reg_To_Imm(EA_8BYTE, REG_ARG_0, (ssize_t)compiler->compProfilerMethHnd); - } - } - - // RSI = caller's SP - if (compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) - { - int callerSPOffset = compiler->lvaToCallerSPRelativeOffset(0, isFramePointerUsed()); - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_ARG_1, genFramePointerReg(), -callerSPOffset); - } - else - { - LclVarDsc* varDsc = compiler->lvaTable; - NYI_IF((varDsc == nullptr) || !varDsc->lvIsParam, "Profiler ELT callback for a method without any params"); - - // lea rdx, [FramePointer + Arg0's offset] - GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, REG_ARG_1, 0, 0); - } - - // We can use any callee trash register (other than RAX, RDI, RSI) for call target. - // We use R11 here. This will emit either - // "call ip-relative 32-bit offset" or - // "mov r11, helper addr; call r11" - genEmitHelperCall(helper, 0, EA_UNKNOWN, REG_DEFAULT_PROFILER_CALL_TARGET); - -#endif // !defined(UNIX_AMD64_ABI) -} - - -#endif // PROFILING_SUPPORTED - -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 581dc7dddb44..885c1f602783 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -204,7 +204,7 @@ void Compiler::compDspSrcLinesByLineNum(unsigned line, bool seek) } /*****************************************************************************/ - +#ifndef TARGET_WASM void Compiler::compDspSrcLinesByNativeIP(UNATIVE_OFFSET curIP) { static IPmappingDsc* nextMappingDsc; @@ -262,6 +262,7 @@ void Compiler::compDspSrcLinesByNativeIP(UNATIVE_OFFSET curIP) } } } +#endif // TARGET_WASM /*****************************************************************************/ #endif // DEBUG @@ -830,7 +831,7 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, // Arm64 Windows VarArg methods arguments will not classify HFA/HVA types, they will need to be treated // as if they are not HFA/HVA types. var_types hfaType; -#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) || defined (TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WINDOWS) && defined(TARGET_ARM64) || defined(TARGET_WASM) if (isVarArg) { hfaType = TYP_UNDEF; @@ -923,7 +924,7 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, howToPassStruct = SPK_ByValue; useType = TYP_STRUCT; -#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined (TARGET_WASM32) || defined(TARGET_WASM64) // TODO: WASM can in theory pass any size struct as an arg. +#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM) // TODO: WASM can in theory pass any size struct as an arg. 
// Otherwise we pass this struct by reference to a copy // setup wbPassType and useType indicate that this is passed using one register (by reference to a copy) @@ -948,6 +949,61 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, return useType; } +#ifdef TARGET_WASM +bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass) +{ + return false; // TODO WASM +} +var_types Compiler::GetHfaType(GenTree* tree) +{ + return TYP_UNDEF; // TODO WASM +} +var_types Compiler::GetHfaType(CORINFO_CLASS_HANDLE hClass) +{ + return TYP_UNDEF; +} +//------------------------------------------------------------------------ +// GetHfaCount: Given a class handle for an HFA struct +// return the number of registers needed to hold the HFA +// +// Note that on ARM32 the single precision registers overlap with +// the double precision registers and for that reason each +// double register is considered to be two single registers. +// Thus for ARM32 an HFA of 4 doubles this function will return 8. +// On ARM64 given an HFA of 4 singles or 4 doubles this function will +// will return 4 for both. +// Arguments: +// hClass: the class handle of a HFA struct +// +unsigned Compiler::GetHfaCount(CORINFO_CLASS_HANDLE hClass) +{ + assert(false); // TODO + //assert(IsHfa(hClass)); + //var_types hfaType = GetHfaType(hClass); + //unsigned classSize = info.compCompHnd->getClassSize(hClass); + //// Note that the retail build issues a warning about a potential divsion by zero without the Max function + //unsigned elemSize = Max((unsigned)1, (unsigned)EA_SIZE_IN_BYTES(emitActualTypeSize(hfaType))); + //return classSize / elemSize; + return 1; +} + +IL_OFFSET jitGetILoffs(IL_OFFSETX offsx) +{ + assert(offsx != BAD_IL_OFFSET); + + switch ((int)offsx) // Need the cast since offs is unsigned and the case statements are comparing to signed. + { + case ICorDebugInfo::NO_MAPPING: + case ICorDebugInfo::PROLOG: + case ICorDebugInfo::EPILOG: + unreached(); + + default: + return IL_OFFSET(offsx & ~IL_OFFSETX_BITS); + } +} +#endif //TARGET_WASM + //----------------------------------------------------------------------------- // getReturnTypeForStruct: // Get the type that is used to return values of the given struct type. 
@@ -1445,8 +1501,9 @@ void Compiler::compStartup() #endif /* Initialize the emitter */ - +#ifndef TARGET_WASM emitter::emitInit(); +#endif // !TARGET_WASM // Static vars of ValueNumStore ValueNumStore::InitValueNumStoreStatics(); @@ -1480,9 +1537,11 @@ void Compiler::compShutdown() DisplayNowayAssertMap(); #endif // MEASURE_NOWAY +#ifndef TARGET_WASM /* Shut down the emitter */ emitter::emitDone(); +#endif // !TARGET_WASM #if defined(DEBUG) || defined(INLINE_DATA) // Finish reading and/or writing inline xml @@ -1928,7 +1987,9 @@ void Compiler::compInit(ArenaAllocator* pAlloc, if (!compIsForInlining()) { +#ifndef TARGET_WASM codeGen = getCodeGenerator(this); +#endif // !TARGET_WASM optInit(); hashBv::Init(this); @@ -2686,7 +2747,9 @@ void Compiler::compInitOptions(JitFlags* jitFlags) else { verbose = false; +#ifndef TARGET_WASM codeGen->setVerbose(false); +#endif // !TARGET_WASM } verboseTrees = verbose && shouldUseVerboseTrees(); verboseSsa = verbose && shouldUseVerboseSsa(); @@ -3143,7 +3206,9 @@ void Compiler::compInitOptions(JitFlags* jitFlags) verbose = true; verboseTrees = shouldUseVerboseTrees(); verboseSsa = shouldUseVerboseSsa(); +#ifndef TARGET_WASM codeGen->setVerbose(true); +#endif // !TARGET_WASM } treesBeforeAfterMorph = (JitConfig.TreesBeforeAfterMorph() == 1); @@ -3203,7 +3268,9 @@ void Compiler::compInitOptions(JitFlags* jitFlags) //------------------------------------------------------------------------- #ifdef DEBUG +#ifndef TARGET_WASM assert(!codeGen->isGCTypeFixed()); +#endif // !TARGET_WASM opts.compGcChecks = (JitConfig.JitGCChecks() != 0) || compStressCompile(STRESS_GENERIC_VARN, 5); #endif @@ -3966,6 +4033,7 @@ void Compiler::compSetOptimizationLevel() opts.compFlags |= CLFLG_MINOPT; } +#ifndef TARGET_WASM if (!compIsForInlining()) { codeGen->setFramePointerRequired(false); @@ -3998,6 +4066,7 @@ void Compiler::compSetOptimizationLevel() codeGen->SetAlignLoops(JitConfig.JitAlignLoops() == 1); } } +#endif // !TARGET_WASM fgCanRelocateEHRegions = true; } @@ -4371,7 +4440,7 @@ void Compiler::EndPhase(Phases phase) mostRecentlyActivePhase = phase; } -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WASM) inline void DoLlvmPhase(Compiler* _compiler) { fatal(CORJIT_SKIPPED); @@ -4512,6 +4581,7 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags // Note that requiring a EBP Frame disallows double alignment. Thus if we change this // we either have to disallow double alignment for E&C some other way or handle it in EETwain. +#ifndef TARGET_WASM if (opts.compDbgEnC) { codeGen->setFramePointerRequired(true); @@ -4522,6 +4592,7 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags // // compLocallocUsed = true; } +#endif // !TARGET_WASM // Start phases that are broadly called morphing, and includes // global morph, as well as other phases that massage the trees so @@ -4780,8 +4851,10 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags } #endif // defined(FEATURE_EH_FUNCLETS) && defined(TARGET_ARM) +#ifndef TARGET_WASM // Decide the kind of code we want to generate fgSetOptions(); +#endif // !TARGET_WASM fgExpandQmarkNodes(); @@ -5054,11 +5127,10 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags Rationalizer rat(this); // PHASE_RATIONALIZE rat.Run(); -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WASM) // TODO:after rat, but better before? DoLlvmPhase(this); // DoPhase? 
- return; -#endif +#else // Here we do "simple lowering". When the RyuJIT backend works for all // platforms, this will be part of the more general lowering phase. For now, though, we do a separate @@ -5172,8 +5244,11 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags fprintf(compJitFuncInfoFile, ""); // in our logic this causes a flush } #endif // FUNC_INFO_LOGGING +#endif // TARGET_WASM + } +#ifndef TARGET_WASM //------------------------------------------------------------------------ // generatePatchpointInfo: allocate and fill in patchpoint info data, // and report it to the VM @@ -5253,6 +5328,7 @@ void Compiler::generatePatchpointInfo() // Register this with the runtime. info.compCompHnd->setPatchpointInfo(patchpointInfo); } +#endif // !TARGET_WASM //------------------------------------------------------------------------ // ResetOptAnnotations: Clear annotations produced during global optimizations. @@ -5666,9 +5742,11 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr, goto DoneCleanUp; } +#ifndef TARGET_WASM /* Tell the emitter that we're done with this function */ GetEmitter()->emitEndCG(); +#endif // !TARGET_WASM DoneCleanUp: compDone(); @@ -6182,12 +6260,14 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr, compBasicBlockID = 0; #endif +#ifndef TARGET_WASM /* Initialize emitter */ if (!compIsForInlining()) { codeGen->GetEmitter()->emitBegCG(this, compHnd); } +#endif // !TARGET_WASM info.compIsStatic = (info.compFlags & CORINFO_FLG_STATIC) != 0; @@ -8547,6 +8627,7 @@ void cEH(Compiler* comp) comp->fgDispHandlerTab(); } +#ifndef TARGET_WASM void cVar(Compiler* comp, unsigned lclNum) { static unsigned sequenceNumber = 0; // separate calls with a number to indicate this function has been called @@ -8575,6 +8656,7 @@ void cVarsFinal(Compiler* comp) printf("===================================================================== *Vars %u\n", sequenceNumber++); comp->lvaTableDump(Compiler::FINAL_FRAME_LAYOUT); } +#endif // !TARGET_WASM void cBlockCheapPreds(Compiler* comp, BasicBlock* block) { @@ -8683,6 +8765,7 @@ void dEH() cEH(JitTls::GetCompiler()); } +#ifndef TARGET_WASM void dVar(unsigned lclNum) { cVar(JitTls::GetCompiler(), lclNum); @@ -8702,6 +8785,7 @@ void dVarsFinal() { cVarsFinal(JitTls::GetCompiler()); } +#endif // !TARGET_WASM void dBlockPreds(BasicBlock* block) { diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 894f567a1f2f..554acf735a32 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -2564,7 +2564,9 @@ class Compiler } void* ehEmitCookie(BasicBlock* block); +#ifndef TARGET_WASM UNATIVE_OFFSET ehCodeOffset(BasicBlock* block); +#endif // !TARGET_WASM EHblkDsc* ehInitHndRange(BasicBlock* src, IL_OFFSET* hndBeg, IL_OFFSET* hndEnd, bool* inFilter); @@ -3416,8 +3418,10 @@ class Compiler void lvaDumpRegLocation(unsigned lclNum); void lvaDumpFrameLocation(unsigned lclNum); void lvaDumpEntry(unsigned lclNum, FrameLayoutState curState, size_t refCntWtdWidth = 6); +#ifndef TARGET_WASM void lvaTableDump(FrameLayoutState curState = NO_FRAME_LAYOUT); // NO_FRAME_LAYOUT means use the current frame // layout state defined by lvaDoneFrameLayout +#endif //!TARGET_WASM #endif // Limit frames size to 1GB. 
The maximum is 2GB in theory - make it intentionally smaller @@ -4733,7 +4737,9 @@ class Compiler bool fgMorphBlockStmt(BasicBlock* block, Statement* stmt DEBUGARG(const char* msg)); +#ifndef TARGET_WASM void fgSetOptions(); +#endif // !TARGET_WASM #ifdef DEBUG static fgWalkPreFn fgAssertNoQmark; @@ -7325,8 +7331,9 @@ class Compiler */ public: +#ifndef TARGET_WASM regNumber raUpdateRegStateForArg(RegState* regState, LclVarDsc* argDsc); - +#endif void raMarkStkVars(); protected: @@ -7530,7 +7537,7 @@ class Compiler #elif defined(TARGET_ARM64) reg = REG_R11; regMask = RBM_R11; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) //TODO: empty better? +#elif defined(TARGET_WASM) //TODO: empty better? if (isCoreRTABI) { reg = REG_R10; @@ -7605,6 +7612,7 @@ class Compiler unsigned eeVarsCount; +#ifndef TARGET_WASM struct VarResultInfo { UNATIVE_OFFSET startOffset; @@ -7619,6 +7627,7 @@ class Compiler unsigned varNum, const CodeGenInterface::siVarLoc& loc); void eeSetLVdone(); +#endif #ifdef DEBUG void eeDispVar(ICorDebugInfo::NativeVarInfo* var); @@ -7705,7 +7714,7 @@ class Compiler CodeGenInterface* codeGen; // The following holds information about instr offsets in terms of generated code. - +#ifndef TARGET_WASM struct IPmappingDsc { IPmappingDsc* ipmdNext; // next line# record @@ -7713,11 +7722,11 @@ class Compiler IL_OFFSETX ipmdILoffsx; // the instr offset bool ipmdIsLabel; // Can this code be a branch label? }; - // Record the instr offset mapping to the generated code IPmappingDsc* genIPmappingList; IPmappingDsc* genIPmappingLast; +#endif // Managed RetVal - A side hash table meant to record the mapping from a // GT_CALL node to its IL offset. This info is used to emit sequence points @@ -7740,6 +7749,7 @@ class Compiler // convenience and backward compatibility, but the properties can only be set by invoking // the setter on CodeGenContext directly. +#ifndef TARGET_WASM emitter* GetEmitter() const { return codeGen->GetEmitter(); @@ -7749,14 +7759,21 @@ class Compiler { return codeGen->isFramePointerUsed(); } - +#endif bool GetInterruptible() { +#ifdef TARGET_WASM + return false; +#else return codeGen->GetInterruptible(); +#endif } void SetInterruptible(bool value) { +#ifndef TARGET_WASM codeGen->SetInterruptible(value); +#else +#endif // !TARGET_WASM } #ifdef TARGET_ARMARCH @@ -7786,11 +7803,18 @@ class Compiler bool IsFullPtrRegMapRequired() { +#ifndef TARGET_WASM return codeGen->IsFullPtrRegMapRequired(); +#else + return false; // For GCInfo TODO: sensible default? +#endif // TARGET_WASM } void SetFullPtrRegMapRequired(bool value) { +#ifndef TARGET_WASM codeGen->SetFullPtrRegMapRequired(value); +#else +#endif // TARGET_WASM } // Things that MAY belong either in CodeGen or CodeGenContext @@ -7988,7 +8012,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX UNATIVE_OFFSET unwindGetCurrentOffset(FuncInfoDsc* func); -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: delete? +#if defined(TARGET_AMD64) || defined(TARGET_WASM) // TODO: delete? 
void unwindBegPrologWindows(); void unwindPushWindows(regNumber reg); @@ -8045,7 +8069,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // Get highest available level for SIMD codegen SIMDLevel getSIMDSupportLevel() { -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) if (compOpportunisticallyDependsOn(InstructionSet_AVX2)) { return SIMD_AVX2_Supported; @@ -8058,6 +8082,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // min bar is SSE2 return SIMD_SSE2_Supported; +#elif defined(TARGET_WASM) + assert(!"WASM supports SIMD so what to do here?"); + return SIMD_Not_Supported; #else assert(!"Available instruction set(s) for SIMD codegen is not defined for target arch"); unreached(); @@ -9637,7 +9664,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // In case of Amd64 this doesn't include float regs saved on stack. unsigned compCalleeRegsPushed; -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm // Mask of callee saved float regs on stack. regMaskTP compCalleeFPRegsSavedMask; #endif @@ -11239,7 +11266,7 @@ const instruction INS_SQRT = INS_fsqrt; #endif // TARGET_ARM64 -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WASM) const instruction INS_SHIFT_LEFT_LOGICAL = INS_shl; const instruction INS_SHIFT_RIGHT_LOGICAL = INS_shr; @@ -11258,14 +11285,18 @@ const instruction INS_ADDC = INS_adc; const instruction INS_SUBC = INS_sbb; const instruction INS_NOT = INS_not; -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) +#endif // defined(TARGET_WASM) /*****************************************************************************/ +#ifndef TARGET_WASM extern const BYTE genTypeSizes[]; +#endif //!TARGET_WASM extern const BYTE genTypeAlignments[]; +#ifndef TARGET_WASM extern const BYTE genTypeStSzs[]; extern const BYTE genActualTypes[]; +#endif //!TARGET_WASM /*****************************************************************************/ diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index d7576d6b4d60..c7f4f97dfb89 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -2034,6 +2034,7 @@ inline int Compiler::lvaCachedGenericContextArgOffset() return lvaCachedGenericContextArgOffs; } +#ifndef TARGET_WASM //------------------------------------------------------------------------ // lvaFrameAddress: Determine the stack frame offset of the given variable, // and how to generate an address to that stack frame. 
@@ -2244,6 +2245,7 @@ inline return varOffset; } +#endif inline bool Compiler::lvaIsParameter(unsigned varNum) { diff --git a/src/coreclr/jit/ee_il_dll.cpp b/src/coreclr/jit/ee_il_dll.cpp index c41164a75ecd..fdee523d5bba 100644 --- a/src/coreclr/jit/ee_il_dll.cpp +++ b/src/coreclr/jit/ee_il_dll.cpp @@ -571,6 +571,7 @@ void Compiler::eeGetStmtOffsets() info.compCompHnd->freeArray(offsets); } +#ifndef TARGET_WASM /***************************************************************************** * * Debugging support - Local var info @@ -632,6 +633,7 @@ void Compiler::eeSetLVdone() eeVars = nullptr; // We give up ownership after setVars() } +#endif // !TARGET_WASM void Compiler::eeGetVars() { @@ -759,6 +761,7 @@ void Compiler::eeGetVars() #endif // DEBUG } +#ifndef TARGET_WASM #ifdef DEBUG void Compiler::eeDispVar(ICorDebugInfo::NativeVarInfo* var) { @@ -854,6 +857,7 @@ void Compiler::eeDispVar(ICorDebugInfo::NativeVarInfo* var) printf("\n"); } +#endif // !TARGET_WASM // Same parameters as ICorStaticInfo::setVars(). void Compiler::eeDispVars(CORINFO_METHOD_HANDLE ftn, ULONG32 cVars, ICorDebugInfo::NativeVarInfo* vars) @@ -1108,6 +1112,7 @@ WORD Compiler::eeGetRelocTypeHint(void* target) } } +#ifndef TARGET_WASM CORINFO_FIELD_HANDLE Compiler::eeFindJitDataOffs(unsigned dataOffs) { // Data offsets are marked by the fact that the low two bits are 0b01 0x1 @@ -1145,6 +1150,7 @@ int Compiler::eeGetJitDataOffs(CORINFO_FIELD_HANDLE field) return -1; } } +#endif // !TARGET_WASM /***************************************************************************** * diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index d136b4b570fc..2d252ef0cf0f 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -9,7 +9,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ - +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -2172,7 +2172,7 @@ bool emitter::emitHasEpilogEnd() #endif // JIT32_GCENCODER -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#ifdef TARGET_XARCH /***************************************************************************** * @@ -3274,9 +3274,6 @@ const size_t hexEncodingSize = 19; #elif defined(TARGET_ARM) const size_t basicIndent = 12; const size_t hexEncodingSize = 11; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm -const size_t basicIndent = 7; -const size_t hexEncodingSize = 21; #endif #ifdef DEBUG @@ -4477,8 +4474,6 @@ void emitter::emitJumpDistBind() // The size of IF_LARGEJMP/IF_LARGEADR/IF_LARGELDC are 8 or 12. // All other code size is 4. 
assert((sizeDif == 4) || (sizeDif == 8)); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - jmp->idCodeSize(jsz); #else #error Unsupported or unset target architecture #endif @@ -5913,8 +5908,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, #elif defined(TARGET_ARM64) assert(!jmp->idAddr()->iiaHasInstrCount()); emitOutputLJ(NULL, adr, jmp); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - * (BYTE*)adr -= (BYTE)adj; #else #error Unsupported or unset target architecture #endif @@ -5923,7 +5916,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { // Patch Forward non-Short Jump CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) *(int*)adr -= adj; #elif defined(TARGET_ARMARCH) assert(!jmp->idAddr()->iiaHasInstrCount()); @@ -8542,3 +8535,4 @@ regMaskTP emitter::emitGetGCRegsKilledByNoGCCall(CorInfoHelpFunc helper) return result; } +#endif // TARGET_WASM diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 75da1c222b26..a46c077ac0f0 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1,7 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. /*****************************************************************************/ - +#ifndef TARGET_WASM #ifndef _EMIT_H_ #define _EMIT_H_ @@ -576,10 +576,6 @@ class emitter #elif defined(TARGET_ARM64) static_assert_no_msg(INS_count <= 512); instruction _idIns : 9; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO ?? - static_assert_no_msg(INS_count <= 1024); - instruction _idIns : 10; -#define MAX_ENCODED_SIZE 15 #else // !(defined(TARGET_XARCH) || defined(TARGET_ARM64)) static_assert_no_msg(INS_count <= 256); instruction _idIns : 8; @@ -589,9 +585,6 @@ class emitter #if defined(TARGET_XARCH) static_assert_no_msg(IF_COUNT <= 128); insFormat _idInsFmt : 7; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - static_assert_no_msg(IF_COUNT <= 128); - insFormat _idInsFmt : 7; #else static_assert_no_msg(IF_COUNT <= 256); insFormat _idInsFmt : 8; @@ -642,12 +635,6 @@ class emitter // doesn't cross a byte boundary. #elif defined(TARGET_ARM64) // Moved the definition of '_idOpSize' later so that we don't cross a 32-bit boundary when laying out bitfields - -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: wasm? - unsigned _idCodeSize : 4; // size of instruction in bytes. Max size of an Intel instruction is 15 bytes. - opSize _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16, 5=32 - // At this point we have fully consumed first DWORD so that next field - // doesn't cross a byte boundary. #else // ARM opSize _idOpSize : 2; // operand size: 0=1 , 1=2 , 2=4 , 3=8 #endif // ARM @@ -714,9 +701,6 @@ class emitter #elif defined(TARGET_XARCH) // For xarch, we have used 14 bits from the second DWORD. #define ID_EXTRA_BITFIELD_BITS (14) -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - // TODO: delete? -#define ID_EXTRA_BITFIELD_BITS (14) #else #error Unsupported or unset target architecture #endif @@ -868,13 +852,7 @@ class emitter regNumber _idReg3 : REGNUM_BITS; regNumber _idReg4 : REGNUM_BITS; }; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: delete? 
- struct - { - regNumber _idReg3 : REGNUM_BITS; - regNumber _idReg4 : REGNUM_BITS; - }; -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) +#endif // defined(TARGET_XARCH) } _idAddrUnion; @@ -972,29 +950,7 @@ class emitter _idInsFlags = sf; assert(sf == _idInsFlags); } -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - - unsigned idCodeSize() const - { - return _idCodeSize; - } - void idCodeSize(unsigned sz) - { - if (sz > 15) - { - // This is a temporary workaround for non-precise instr size - // estimator on XARCH. It often overestimates sizes and can - // return value more than 15 that doesn't fit in 4 bits _idCodeSize. - // If somehow we generate instruction that needs more than 15 bytes we - // will fail on another assert in emit.cpp: noway_assert(id->idCodeSize() >= csz). - // Issue https://github.com/dotnet/runtime/issues/12840. - sz = 15; - } - assert(sz <= 15); // Intel decoder limit. - _idCodeSize = sz; - assert(sz == _idCodeSize); - } -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) +#endif // TARGET_ARM emitAttr idOpSize() { @@ -1362,24 +1318,6 @@ class emitter #define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_1C #define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_4C -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) - -// a read,write or modify from stack location, possible def to use latency from L0 cache -#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_2C -#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_2C -#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_5C - -// a read, write or modify from constant location, possible def to use latency from L0 cache -#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_2C -#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_2C -#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_5C - -// a read, write or modify from memory location, possible def to use latency from L0 or L1 cache -// plus an extra cost (of 1.0) for a increased chance of a cache miss -#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_3C -#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_3C -#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_6C - #endif // TARGET_XXX // Make this an enum: @@ -1481,21 +1419,6 @@ class emitter #endif // TARGET_XARCH -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // copying AMD64 - - struct instrDescAmd : instrDesc // large addrmode disp - { - ssize_t idaAmdVal; - }; - - struct instrDescCnsAmd : instrDesc // large cons + addrmode disp - { - ssize_t idacCnsVal; - ssize_t idacAmdVal; - }; - -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) - struct instrDescCGCA : instrDesc // call with ... { VARSET_TP idcGCvars; // ... 
updated GC vars or @@ -1549,7 +1472,7 @@ class emitter size_t emitGetInstrDescSize(const instrDesc* id); size_t emitGetInstrDescSizeSC(const instrDesc* id); -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#ifdef TARGET_XARCH ssize_t emitGetInsCns(instrDesc* id); ssize_t emitGetInsDsp(instrDesc* id); @@ -1623,7 +1546,7 @@ class emitter unsigned emitEpilogCnt; UNATIVE_OFFSET emitEpilogSize; -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#ifdef TARGET_XARCH void emitStartExitSeq(); // Mark the start of the "return" sequence emitLocation emitExitSeqBegLoc; @@ -2964,3 +2887,4 @@ inline void emitter::emitEnableGC() /*****************************************************************************/ #endif // _EMIT_H_ /*****************************************************************************/ +#endif // TARGET_WASM diff --git a/src/coreclr/jit/emitdef.h b/src/coreclr/jit/emitdef.h index cde967a26fc4..c9f003ccce1b 100644 --- a/src/coreclr/jit/emitdef.h +++ b/src/coreclr/jit/emitdef.h @@ -12,8 +12,6 @@ #include "emitarm.h" #elif defined(TARGET_ARM64) #include "emitarm64.h" -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) -#include "emitwasm.h" #else #error Unsupported or unset target architecture #endif diff --git a/src/coreclr/jit/emitfmts.h b/src/coreclr/jit/emitfmts.h index e9c56bd9bff4..f4a8af429ca7 100644 --- a/src/coreclr/jit/emitfmts.h +++ b/src/coreclr/jit/emitfmts.h @@ -8,8 +8,7 @@ #include "emitfmtsarm.h" #elif defined(TARGET_ARM64) #include "emitfmtsarm64.h" -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: need anything here? Try removing -#include "emitfmtswasm.h" +#elif defined(TARGET_WASM) // this file included in CMakeList.txt unconditionally #else #error Unsupported or unset target architecture #endif // target type diff --git a/src/coreclr/jit/emitfmtswasm.h b/src/coreclr/jit/emitfmtswasm.h deleted file mode 100644 index 09c674ffba30..000000000000 --- a/src/coreclr/jit/emitfmtswasm.h +++ /dev/null @@ -1,218 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
-////////////////////////////////////////////////////////////////////////////// - -// -// This file was previously known as emitfmts.h -// - -// clang-format off -#if !defined(TARGET_WASM32) && !defined(TARGET_WASM64) - #error Unexpected target type -#endif - -#ifdef DEFINE_ID_OPS -////////////////////////////////////////////////////////////////////////////// - -#undef DEFINE_ID_OPS - -enum ID_OPS -{ - ID_OP_NONE, // no additional arguments - ID_OP_SCNS, // small const operand (21-bits or less, no reloc) - ID_OP_CNS, // constant operand - ID_OP_DSP, // displacement operand - ID_OP_DSP_CNS, // displacement + constant - ID_OP_AMD, // addrmode with dsp - ID_OP_AMD_CNS, // addrmode with dsp + constant - ID_OP_JMP, // local jump - ID_OP_LBL, // label operand - ID_OP_CALL, // direct method call - ID_OP_SPEC, // special handling required -}; - -////////////////////////////////////////////////////////////////////////////// -#else // !DEFINE_ID_OPS -////////////////////////////////////////////////////////////////////////////// - -#ifdef DEFINE_IS_OPS -#undef DEFINE_IS_OPS - -#else // DEFINE_IS_OPS - -////////////////////////////////////////////////////////////////////////////// - -#ifndef IF_DEF -#error Must define IF_DEF macro before including this file -#endif - -////////////////////////////////////////////////////////////////////////////// -// -// A note on the naming convention for instruction forms (IF_xxxxx). -// For 3-character code XYY, generally we have: -// X = -// R - register -// M - memory -// S - stack -// A - address mode -// YY = -// RD - read -// WR - write -// RW - read / write -// -// The following sequences don't follow this pattern: -// XYY = -// CNS - constant -// SHF - shift-constant -// -// For IF_XXX_YYY, the first operand is XXX, the second operand is YYY. -// -////////////////////////////////////////////////////////////////////////////// - -////////////////////////////////////////////////////////////////////////////// -// -// enum insFormat instruction enum ID_OPS -// scheduling -// (unused) -////////////////////////////////////////////////////////////////////////////// - -IF_DEF(NONE, IS_NONE, NONE) // no operands - -IF_DEF(LABEL, IS_NONE, JMP ) // label -IF_DEF(RWR_LABEL, IS_R1_WR, JMP ) // write label to register -IF_DEF(SWR_LABEL, IS_SF_WR, LBL ) // write label to stack - -IF_DEF(METHOD, IS_NONE, CALL) // method -IF_DEF(METHPTR, IS_NONE, CALL) // method ptr (glbl) - -IF_DEF(CNS, IS_NONE, SCNS) // const - -//---------------------------------------------------------------------------- -// NOTE: The order of the "RD/WR/RW" varieties must match that of -// the "insUpdateModes" enum in "instr.h". 
-//---------------------------------------------------------------------------- - -IF_DEF(RRD, IS_R1_RD, NONE) // read reg -IF_DEF(RWR, IS_R1_WR, NONE) // write reg -IF_DEF(RRW, IS_R1_RW, NONE) // r/w reg - -IF_DEF(RRD_CNS, IS_R1_RD, SCNS) // read reg , const -IF_DEF(RWR_CNS, IS_R1_WR, SCNS) // write reg , const -IF_DEF(RRW_CNS, IS_R1_RW, SCNS) // r/w reg , const -IF_DEF(RRW_SHF, IS_R1_RW, SCNS) // r/w reg , shift-const - -IF_DEF(RRD_RRD, IS_R1_RD|IS_R2_RD, NONE) // read reg , read reg2 -IF_DEF(RWR_RRD, IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2 -IF_DEF(RRW_RRD, IS_R1_RW|IS_R2_RD, NONE) // r/w reg , read reg2 -IF_DEF(RRW_RRW, IS_R1_RW|IS_R2_RW, NONE) // r/w reg , r/w reg2 - for XCHG reg, reg2 -IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW, SCNS) // r/w reg , r/w reg2 , const - -IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3 -IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const - -IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, CNS) // write reg , read reg2 , read reg3 , read reg4 -//---------------------------------------------------------------------------- -// The following formats are used for direct addresses (e.g. static data members) -//---------------------------------------------------------------------------- - -IF_DEF(MRD, IS_GM_RD, SPEC) // read [mem] (indirect call req. SPEC) -IF_DEF(MWR, IS_GM_WR, DSP) // write [mem] -IF_DEF(MRW, IS_GM_RW, DSP) // r/w [mem] -IF_DEF(MRD_OFF, IS_GM_RD, DSP) // offset mem - -IF_DEF(RRD_MRD, IS_GM_RD|IS_R1_RD, DSP) // read reg , read [mem] -IF_DEF(RWR_MRD, IS_GM_RD|IS_R1_WR, DSP) // write reg , read [mem] -IF_DEF(RRW_MRD, IS_GM_RD|IS_R1_RW, DSP) // r/w reg , read [mem] -IF_DEF(RRW_MRD_CNS, IS_GM_RD|IS_R1_RW, DSP_CNS) // r/w reg , read [mem], const - -IF_DEF(RWR_RRD_MRD, IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP) // write reg , read reg2 , read [mem] -IF_DEF(RWR_MRD_CNS, IS_GM_RD|IS_R1_WR, DSP_CNS) // write reg , read [mem], const -IF_DEF(RWR_RRD_MRD_CNS, IS_GM_RD|IS_R1_WR|IS_R2_RD, DSP_CNS) // write reg , read reg2 , read [mem], const -IF_DEF(RWR_RRD_MRD_RRD, IS_GM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, DSP_CNS) // write reg , read reg2 , read [mem], read reg3 -IF_DEF(RWR_MRD_OFF, IS_GM_RD|IS_R1_WR, DSP) // write reg , offset mem - -IF_DEF(MRD_RRD, IS_GM_RD|IS_R1_RD, DSP) // read [mem], read reg -IF_DEF(MWR_RRD, IS_GM_WR|IS_R1_RD, DSP) // write [mem], read reg -IF_DEF(MRW_RRD, IS_GM_RW|IS_R1_RD, DSP) // r/w [mem], read reg - -IF_DEF(MRD_CNS, IS_GM_RD, DSP_CNS) // read [mem], const -IF_DEF(MWR_CNS, IS_GM_WR, DSP_CNS) // write [mem], const -IF_DEF(MRW_CNS, IS_GM_RW, DSP_CNS) // r/w [mem], const - -IF_DEF(MWR_RRD_CNS, IS_GM_WR|IS_R1_RD, DSP_CNS) // write [mem], read reg, const - -IF_DEF(MRW_SHF, IS_GM_RW, DSP_CNS) // shift [mem], const - -//---------------------------------------------------------------------------- -// The following formats are used for stack frame refs -//---------------------------------------------------------------------------- - -IF_DEF(SRD, IS_SF_RD, SPEC) // read [stk] (indirect call req. 
SPEC) -IF_DEF(SWR, IS_SF_WR, NONE) // write [stk] -IF_DEF(SRW, IS_SF_RW, NONE) // r/w [stk] - -IF_DEF(RRD_SRD, IS_SF_RD|IS_R1_RD, NONE) // read reg , read [stk] -IF_DEF(RWR_SRD, IS_SF_RD|IS_R1_WR, NONE) // write reg , read [stk] -IF_DEF(RRW_SRD, IS_SF_RD|IS_R1_RW, NONE) // r/w reg , read [stk] -IF_DEF(RRW_SRD_CNS, IS_SF_RD|IS_R1_RW, CNS ) // r/w reg , read [stk], const - -IF_DEF(RWR_RRD_SRD, IS_SF_RD|IS_R1_WR|IS_R2_RD, NONE) // write reg , read reg2, read [stk] -IF_DEF(RWR_SRD_CNS, IS_SF_RD|IS_R1_WR, CNS ) // write reg , read [stk], const -IF_DEF(RWR_RRD_SRD_CNS, IS_SF_RD|IS_R1_WR|IS_R2_RD, CNS ) // write reg , read reg2, read [stk], const -IF_DEF(RWR_RRD_SRD_RRD, IS_SF_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, CNS ) // write reg , read reg2, read [stk], read reg3 - -IF_DEF(SRD_RRD, IS_SF_RD|IS_R1_RD, NONE) // read [stk], read reg -IF_DEF(SWR_RRD, IS_SF_WR|IS_R1_RD, NONE) // write [stk], read reg -IF_DEF(SRW_RRD, IS_SF_RW|IS_R1_RD, NONE) // r/w [stk], read reg - -IF_DEF(SRD_CNS, IS_SF_RD, CNS ) // read [stk], const -IF_DEF(SWR_CNS, IS_SF_WR, CNS ) // write [stk], const -IF_DEF(SRW_CNS, IS_SF_RW, CNS ) // r/w [stk], const - -IF_DEF(SWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [stk], read reg, const - -IF_DEF(SRW_SHF, IS_SF_RW, CNS ) // shift [stk], const - -//---------------------------------------------------------------------------- -// The following formats are used for indirect address modes -//---------------------------------------------------------------------------- - - -IF_DEF(ARD, IS_AM_RD, SPEC) // read [adr] (indirect call req. SPEC) -IF_DEF(AWR, IS_AM_WR, AMD ) // write [adr] -IF_DEF(ARW, IS_AM_RW, AMD ) // r/w [adr] - -IF_DEF(RRD_ARD, IS_AM_RD|IS_R1_RD, AMD ) // read reg , read [adr] -IF_DEF(RWR_ARD, IS_AM_RD|IS_R1_WR, AMD ) // write reg , read [adr] -IF_DEF(RRW_ARD, IS_AM_RD|IS_R1_RW, AMD ) // r/w reg , read [adr] -IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [adr], const - -IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr] -IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR, AMD_CNS) // write reg , read [adr], const -IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD) // write reg , read [adr], read reg2 -IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write reg , read reg2, read [adr], const -IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write reg , read reg2, read [adr], read reg3 - -IF_DEF(ARD_RRD, IS_AM_RD|IS_R1_RD, AMD ) // read [adr], read reg -IF_DEF(AWR_RRD, IS_AM_WR|IS_R1_RD, AMD ) // write [adr], read reg -IF_DEF(ARW_RRD, IS_AM_RW|IS_R1_RD, AMD ) // r/w [adr], read reg - -IF_DEF(AWR_RRD_RRD, IS_AM_WR|IS_R1_RD|IS_R2_RD, AMD ) // write [adr], read reg, read reg - -IF_DEF(ARD_CNS, IS_AM_RD, AMD_CNS) // read [adr], const -IF_DEF(AWR_CNS, IS_AM_WR, AMD_CNS) // write [adr], const -IF_DEF(ARW_CNS, IS_AM_RW, AMD_CNS) // r/w [adr], const - -IF_DEF(AWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [adr], read reg, const - -IF_DEF(ARW_SHF, IS_AM_RW, AMD_CNS) // shift [adr], const - -////////////////////////////////////////////////////////////////////////////// - -#undef IF_DEF - -////////////////////////////////////////////////////////////////////////////// -#endif // DEFINE_IS_OPS -#endif // DEFINE_ID_OPS -////////////////////////////////////////////////////////////////////////////// -// clang-format on diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 564e1e452b6e..484eca3399b4 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -101,7 +101,7 
@@ inline regNumber emitter::inst3opImulReg(instruction ins) * get stored in different places within the instruction descriptor. */ -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#ifdef TARGET_XARCH inline ssize_t emitter::emitGetInsAmd(instrDesc* id) { @@ -335,50 +335,6 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) id->idReg2((regNumber)encodeMask); // Save in idReg2 -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // copy AMD64 - assert(REGNUM_BITS >= 4); - encodeMask = 0; - - if ((regmask & RBM_RSI) != RBM_NONE) - { - encodeMask |= 0x01; - } - if ((regmask & RBM_RDI) != RBM_NONE) - { - encodeMask |= 0x02; - } - if ((regmask & RBM_RBX) != RBM_NONE) - { - encodeMask |= 0x04; - } - if ((regmask & RBM_RBP) != RBM_NONE) - { - encodeMask |= 0x08; - } - - id->idReg1((regNumber)encodeMask); // Save in idReg1 - - encodeMask = 0; - - if ((regmask & RBM_R12) != RBM_NONE) - { - encodeMask |= 0x01; - } - if ((regmask & RBM_R13) != RBM_NONE) - { - encodeMask |= 0x02; - } - if ((regmask & RBM_R14) != RBM_NONE) - { - encodeMask |= 0x04; - } - if ((regmask & RBM_R15) != RBM_NONE) - { - encodeMask |= 0x08; - } - - id->idReg2((regNumber)encodeMask); // Save in idReg2 - #else NYI("unknown target"); #endif @@ -490,45 +446,6 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) regmask |= RBM_R27; if ((encodeMask & 0x10) != 0) regmask |= RBM_R28; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: copy of AMD64 - assert(REGNUM_BITS >= 4); - encodeMask = id->idReg1(); - - if ((encodeMask & 0x01) != 0) - { - regmask |= RBM_RSI; - } - if ((encodeMask & 0x02) != 0) - { - regmask |= RBM_RDI; - } - if ((encodeMask & 0x04) != 0) - { - regmask |= RBM_RBX; - } - if ((encodeMask & 0x08) != 0) - { - regmask |= RBM_RBP; - } - - encodeMask = id->idReg2(); - - if ((encodeMask & 0x01) != 0) - { - regmask |= RBM_R12; - } - if ((encodeMask & 0x02) != 0) - { - regmask |= RBM_R13; - } - if ((encodeMask & 0x04) != 0) - { - regmask |= RBM_R14; - } - if ((encodeMask & 0x08) != 0) - { - regmask |= RBM_R15; - } #else NYI("unknown target"); diff --git a/src/coreclr/jit/emitjmps.h b/src/coreclr/jit/emitjmps.h index 5b37fd152c6d..3d0f35ef1df7 100644 --- a/src/coreclr/jit/emitjmps.h +++ b/src/coreclr/jit/emitjmps.h @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// clang-format off +#ifndef TARGET_WASM #ifndef JMP_SMALL #error Must define JMP_SMALL macro before including this file #endif @@ -46,27 +47,6 @@ JMP_SMALL(lt , ge , blt ) // LT JMP_SMALL(gt , le , bgt ) // GT JMP_SMALL(le , gt , ble ) // LE -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) //copying AMD64 - -// jump reverse instruction -JMP_SMALL(jmp, jmp, jmp) -JMP_SMALL(jo, jno, jo) -JMP_SMALL(jno, jo, jno) -JMP_SMALL(jb, jae, jb) -JMP_SMALL(jae, jb, jae) -JMP_SMALL(je, jne, je) -JMP_SMALL(jne, je, jne) -JMP_SMALL(jbe, ja, jbe) -JMP_SMALL(ja, jbe, ja) -JMP_SMALL(js, jns, js) -JMP_SMALL(jns, js, jns) -JMP_SMALL(jp, jnp, jp) -JMP_SMALL(jnp, jp, jnp) -JMP_SMALL(jl, jge, jl) -JMP_SMALL(jge, jl, jge) -JMP_SMALL(jle, jg, jle) -JMP_SMALL(jg, jle, jg) - #else #error Unsupported or unset target architecture #endif // target type @@ -74,5 +54,6 @@ JMP_SMALL(jg, jle, jg) /*****************************************************************************/ #undef JMP_SMALL /*****************************************************************************/ +#endif // TARGET_WASM // clang-format on diff --git a/src/coreclr/jit/emitwasm.cpp b/src/coreclr/jit/emitwasm.cpp deleted file mode 100644 index 194af05aa127..000000000000 --- a/src/coreclr/jit/emitwasm.cpp +++ /dev/null @@ -1,7217 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XX XX -XX emitwasm.cpp XX -XX XX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -*/ - -#include "jitpch.h" -#ifdef _MSC_VER -#pragma hdrstop -#endif - -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm - -/*****************************************************************************/ -/*****************************************************************************/ - -#include "instr.h" -#include "emit.h" -#include "codegen.h" - -bool IsSSEInstruction(instruction ins) -{ - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_SSE_INSTRUCTION); -} - -bool IsSSEOrAVXInstruction(instruction ins) -{ - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); -} - -bool IsAVXOnlyInstruction(instruction ins) -{ - return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); -} - -bool IsFMAInstruction(instruction ins) -{ - return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION); -} - -bool IsBMIInstruction(instruction ins) -{ - return (ins >= INS_FIRST_BMI_INSTRUCTION) && (ins <= INS_LAST_BMI_INSTRUCTION); -} - -regNumber getBmiRegNumber(instruction ins) -{ - switch (ins) - { - case INS_blsi: - { - return (regNumber)3; - } - - case INS_blsmsk: - { - return (regNumber)2; - } - - case INS_blsr: - { - return (regNumber)1; - } - - default: - { - assert(IsBMIInstruction(ins)); - return REG_NA; - } - } -} - -regNumber getSseShiftRegNumber(instruction ins) -{ - switch (ins) - { - case INS_psrldq: - { - return (regNumber)3; - } - - case INS_pslldq: - { - return (regNumber)7; - } - - case INS_psrld: - case INS_psrlw: - case INS_psrlq: - { - return (regNumber)2; - } - - case INS_pslld: - case INS_psllw: - case INS_psllq: - { - return (regNumber)6; - } - - case INS_psrad: - case INS_psraw: - { - return (regNumber)4; - } - - default: - { 
- assert(!"Invalid instruction for SSE2 instruction of the form: opcode reg, immed8"); - return REG_NA; - } - } -} - -bool emitter::IsAVXInstruction(instruction ins) -{ - return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); -} - -// Returns true if the AVX instruction is a binary operator that requires 3 operands. -// When we emit an instruction with only two operands, we will duplicate the destination -// as a source. -// TODO-XArch-Cleanup: This is a temporary solution for now. Eventually this needs to -// be formalized by adding an additional field to instruction table to -// to indicate whether a 3-operand instruction. -bool emitter::IsDstDstSrcAVXInstruction(instruction ins) -{ - return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsAVXInstruction(ins); -} - -// Returns true if the AVX instruction requires 3 operands that duplicate the source -// register in the vvvv field. -// TODO-XArch-Cleanup: This is a temporary solution for now. Eventually this needs to -// be formalized by adding an additional field to instruction table to -// to indicate whether a 3-operand instruction. -bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) -{ - return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsAVXInstruction(ins); -} - -//------------------------------------------------------------------------ -// AreUpper32BitsZero: check if some previously emitted -// instruction set the upper 32 bits of reg to zero. -// -// Arguments: -// reg - register of interest -// -// Return Value: -// true if previous instruction zeroed reg's upper 32 bits. -// false if it did not, or if we can't safely determine. -// -// Notes: -// Currently only looks back one instruction. -// -// movsx eax, ... might seem viable but we always encode this -// instruction with a 64 bit destination. See TakesRexWPrefix. - -bool emitter::AreUpper32BitsZero(regNumber reg) -{ - // If there are no instructions in this IG, we can look back at - // the previous IG's instructions if this IG is an extension. - // - if ((emitCurIGinsCnt == 0) && ((emitCurIG->igFlags & IGF_EXTEND) == 0)) - { - return false; - } - - instrDesc* id = emitLastIns; - insFormat fmt = id->idInsFmt(); - - // This isn't meant to be a comprehensive check. Just look for what - // seems to be common. - switch (fmt) - { - case IF_RWR_CNS: - case IF_RRW_CNS: - case IF_RRW_SHF: - case IF_RWR_RRD: - case IF_RRW_RRD: - case IF_RWR_MRD: - case IF_RWR_SRD: - case IF_RWR_ARD: - - // Bail if not writing to the right register - if (id->idReg1() != reg) - { - return false; - } - - // Bail if movsx, we always have movsx sign extend to 8 bytes - if (id->idIns() == INS_movsx) - { - return false; - } - - // movzx always zeroes the upper 32 bits. - if (id->idIns() == INS_movzx) - { - return true; - } - - // Else rely on operation size. - return (id->idOpSize() == EA_4BYTE); - - default: - break; - } - - return false; -} - -//------------------------------------------------------------------------ -// AreFlagsSetToZeroCmp: Checks if the previous instruction set the SZ, and optionally OC, flags to -// the same values as if there were a compare to 0 -// -// Arguments: -// reg - register of interest -// opSize - size of register -// needsOCFlags - additionally check the overflow and carry flags -// -// Return Value: -// true if the previous instruction set the flags for reg -// false if not, or if we can't safely determine -// -// Notes: -// Currently only looks back one instruction. 
-bool emitter::AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, bool needsOCFlags) -{ - assert(reg != REG_NA); - // Don't look back across IG boundaries (possible control flow) - if (emitCurIGinsCnt == 0 && ((emitCurIG->igFlags & IGF_EXTEND) == 0)) - { - return false; - } - - instrDesc* id = emitLastIns; - insFormat fmt = id->idInsFmt(); - - // make sure op1 is a reg - switch (fmt) - { - case IF_RWR_CNS: - case IF_RRW_CNS: - case IF_RRW_SHF: - case IF_RWR_RRD: - case IF_RRW_RRD: - case IF_RWR_MRD: - case IF_RWR_SRD: - case IF_RRW_SRD: - case IF_RWR_ARD: - case IF_RRW_ARD: - case IF_RWR: - case IF_RRD: - case IF_RRW: - break; - - default: - return false; - } - - if (id->idReg1() != reg) - { - return false; - } - - switch (id->idIns()) - { - case INS_adc: - case INS_add: - case INS_dec: - case INS_dec_l: - case INS_inc: - case INS_inc_l: - case INS_neg: - case INS_shr_1: - case INS_shl_1: - case INS_sar_1: - case INS_sbb: - case INS_sub: - case INS_xadd: - if (needsOCFlags) - { - return false; - } - FALLTHROUGH; - // these always set OC to 0 - case INS_and: - case INS_or: - case INS_xor: - return id->idOpSize() == opSize; - - default: - break; - } - - return false; -} - -//------------------------------------------------------------------------ -// IsDstSrcImmAvxInstruction: Checks if the instruction has a "reg, reg/mem, imm" or -// "reg/mem, reg, imm" form for the legacy, VEX, and EVEX -// encodings. -// -// Arguments: -// instruction -- processor instruction to check -// -// Return Value: -// true if instruction has a "reg, reg/mem, imm" or "reg/mem, reg, imm" encoding -// form for the legacy, VEX, and EVEX encodings. -// -// That is, the instruction takes two operands, one of which is immediate, and it -// does not need to encode any data in the VEX.vvvv field. -// -static bool IsDstSrcImmAvxInstruction(instruction ins) -{ - switch (ins) - { - case INS_aeskeygenassist: - case INS_extractps: - case INS_pextrb: - case INS_pextrw: - case INS_pextrd: - case INS_pextrq: - case INS_pshufd: - case INS_pshufhw: - case INS_pshuflw: - case INS_roundpd: - case INS_roundps: - return true; - default: - return false; - } -} - -// ------------------------------------------------------------------- -// Is4ByteSSEInstruction: Returns true if the SSE instruction is a 4-byte opcode. -// -// Arguments: -// ins - instruction -// -// Note that this should be true for any of the instructions in instrsXArch.h -// that use the SSE38 or SSE3A macro but returns false if the VEX encoding is -// in use, since that encoding does not require an additional byte. -bool emitter::Is4ByteSSEInstruction(instruction ins) -{ - return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins); -} - -// Returns true if this instruction requires a VEX prefix -// All AVX instructions require a VEX prefix -bool emitter::TakesVexPrefix(instruction ins) -{ - // special case vzeroupper as it requires 2-byte VEX prefix - // special case the fencing, movnti and the prefetch instructions as they never take a VEX prefix - switch (ins) - { - case INS_lfence: - case INS_mfence: - case INS_movnti: - case INS_prefetchnta: - case INS_prefetcht0: - case INS_prefetcht1: - case INS_prefetcht2: - case INS_sfence: - case INS_vzeroupper: - return false; - default: - break; - } - - return IsAVXInstruction(ins); -} - -// Add base VEX prefix without setting W, R, X, or B bits -// L bit will be set based on emitter attr. 
-// -// 2-byte VEX prefix = C5 -// 3-byte VEX prefix = C4 -// - R, X, B, W - bits to express corresponding REX prefixes -// - m-mmmmm (5-bit) -// 0-00001 - implied leading 0F opcode byte -// 0-00010 - implied leading 0F 38 opcode bytes -// 0-00011 - implied leading 0F 3A opcode bytes -// Rest - reserved for future use and usage of them will uresult in Undefined instruction exception -// -// - vvvv (4-bits) - register specifier in 1's complement form; must be 1111 if unused -// - L - scalar or AVX-128 bit operations (L=0), 256-bit operations (L=1) -// - pp (2-bits) - opcode extension providing equivalent functionality of a SIMD size prefix -// these prefixes are treated mandatory when used with escape opcode 0Fh for -// some SIMD instructions -// 00 - None (0F - packed float) -// 01 - 66 (66 0F - packed double) -// 10 - F3 (F3 0F - scalar float -// 11 - F2 (F2 0F - scalar double) -#define DEFAULT_3BYTE_VEX_PREFIX 0xC4E07800000000ULL -#define DEFAULT_3BYTE_VEX_PREFIX_MASK 0xFFFFFF00000000ULL -#define LBIT_IN_3BYTE_VEX_PREFIX 0x00000400000000ULL -emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr attr) -{ - // The 2-byte VEX encoding is preferred when possible, but actually emitting - // it depends on a number of factors that we may not know until much later. - // - // In order to handle this "easily", we just carry the 3-byte encoding all - // the way through and "fix-up" the encoding when the VEX prefix is actually - // emitted, by simply checking that all the requirements were met. - - // Only AVX instructions require VEX prefix - assert(IsAVXInstruction(ins)); - - // Shouldn't have already added VEX prefix - assert(!hasVexPrefix(code)); - - assert((code & DEFAULT_3BYTE_VEX_PREFIX_MASK) == 0); - - code |= DEFAULT_3BYTE_VEX_PREFIX; - - if (attr == EA_32BYTE) - { - // Set L bit to 1 in case of instructions that operate on 256-bits. - code |= LBIT_IN_3BYTE_VEX_PREFIX; - } - - return code; -} - -// Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix -bool TakesRexWPrefix(instruction ins, emitAttr attr) -{ - // Because the current implementation of AVX does not have a way to distinguish between the register - // size specification (128 vs. 256 bits) and the operand size specification (32 vs. 64 bits), where both are - // required, the instruction must be created with the register size attribute (EA_16BYTE or EA_32BYTE), - // and here we must special case these by the opcode. 
- switch (ins) - { - case INS_vpermpd: - case INS_vpermq: - case INS_vpsrlvq: - case INS_vpsllvq: - case INS_pinsrq: - case INS_pextrq: - case INS_vfmadd132pd: - case INS_vfmadd213pd: - case INS_vfmadd231pd: - case INS_vfmadd132sd: - case INS_vfmadd213sd: - case INS_vfmadd231sd: - case INS_vfmaddsub132pd: - case INS_vfmaddsub213pd: - case INS_vfmaddsub231pd: - case INS_vfmsubadd132pd: - case INS_vfmsubadd213pd: - case INS_vfmsubadd231pd: - case INS_vfmsub132pd: - case INS_vfmsub213pd: - case INS_vfmsub231pd: - case INS_vfmsub132sd: - case INS_vfmsub213sd: - case INS_vfmsub231sd: - case INS_vfnmadd132pd: - case INS_vfnmadd213pd: - case INS_vfnmadd231pd: - case INS_vfnmadd132sd: - case INS_vfnmadd213sd: - case INS_vfnmadd231sd: - case INS_vfnmsub132pd: - case INS_vfnmsub213pd: - case INS_vfnmsub231pd: - case INS_vfnmsub132sd: - case INS_vfnmsub213sd: - case INS_vfnmsub231sd: - case INS_vpmaskmovq: - case INS_vpgatherdq: - case INS_vpgatherqq: - case INS_vgatherdpd: - case INS_vgatherqpd: - return true; - default: - break; - } - -#ifdef TARGET_AMD64 - // movsx should always sign extend out to 8 bytes just because we don't track - // whether the dest should be 4 bytes or 8 bytes (attr indicates the size - // of the source, not the dest). - // A 4-byte movzx is equivalent to an 8 byte movzx, so it is not special - // cased here. - // - // Rex_jmp = jmp with rex prefix always requires rex.w prefix. - if (ins == INS_movsx || ins == INS_rex_jmp) - { - return true; - } - - if (EA_SIZE(attr) != EA_8BYTE) - { - return false; - } - - if (IsSSEOrAVXInstruction(ins)) - { - switch (ins) - { - case INS_andn: - case INS_bextr: - case INS_blsi: - case INS_blsmsk: - case INS_blsr: - case INS_bzhi: - case INS_cvttsd2si: - case INS_cvttss2si: - case INS_cvtsd2si: - case INS_cvtss2si: - case INS_cvtsi2sd: - case INS_cvtsi2ss: - case INS_mov_xmm2i: - case INS_mov_i2xmm: - case INS_movnti: - case INS_mulx: - case INS_pdep: - case INS_pext: - case INS_rorx: - return true; - default: - return false; - } - } - - // TODO-XArch-Cleanup: Better way to not emit REX.W when we don't need it, than just testing all these - // opcodes... - // These are all the instructions that default to 8-byte operand without the REX.W bit - // With 1 special case: movzx because the 4 byte version still zeros-out the hi 4 bytes - // so we never need it - if ((ins != INS_push) && (ins != INS_pop) && (ins != INS_movq) && (ins != INS_movzx) && (ins != INS_push_hide) && - (ins != INS_pop_hide) && (ins != INS_ret) && (ins != INS_call) && !((ins >= INS_i_jmp) && (ins <= INS_l_jg))) - { - return true; - } - else - { - return false; - } -#else //! TARGET_AMD64 = TARGET_X86 - return false; -#endif //! TARGET_AMD64 -} - -// Returns true if using this register will require a REX.* prefix. -// Since XMM registers overlap with YMM registers, this routine -// can also be used to know whether a YMM register if the -// instruction in question is AVX. -bool IsExtendedReg(regNumber reg) -{ -#ifdef TARGET_AMD64 - return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM15)); -#else - // X86 JIT operates in 32-bit mode and hence extended reg are not available. 
- return false; -#endif -} - -// Returns true if using this register, for the given EA_SIZE(attr), will require a REX.* prefix -bool IsExtendedReg(regNumber reg, emitAttr attr) -{ -#ifdef TARGET_AMD64 - // Not a register, so doesn't need a prefix - if (reg > REG_XMM15) - { - return false; - } - - // Opcode field only has 3 bits for the register, these high registers - // need a 4th bit, that comes from the REX prefix (eiter REX.X, REX.R, or REX.B) - if (IsExtendedReg(reg)) - { - return true; - } - - if (EA_SIZE(attr) != EA_1BYTE) - { - return false; - } - - // There are 12 one byte registers addressible 'below' r8b: - // al, cl, dl, bl, ah, ch, dh, bh, spl, bpl, sil, dil. - // The first 4 are always addressible, the last 8 are divided into 2 sets: - // ah, ch, dh, bh - // -- or -- - // spl, bpl, sil, dil - // Both sets are encoded exactly the same, the difference is the presence - // of a REX prefix, even a REX prefix with no other bits set (0x40). - // So in order to get to the second set we need a REX prefix (but no bits). - // - // TODO-AMD64-CQ: if we ever want to start using the first set, we'll need a different way of - // encoding/tracking/encoding registers. - return (reg >= REG_RSP); -#else - // X86 JIT operates in 32-bit mode and hence extended reg are not available. - return false; -#endif -} - -// Since XMM registers overlap with YMM registers, this routine -// can also used to know whether a YMM register in case of AVX instructions. -bool IsXMMReg(regNumber reg) -{ -#ifdef TARGET_AMD64 - return (reg >= REG_XMM0) && (reg <= REG_XMM15); -#else // !TARGET_AMD64 - return (reg >= REG_XMM0) && (reg <= REG_XMM7); -#endif // !TARGET_AMD64 -} - -// Returns bits to be encoded in instruction for the given register. -unsigned RegEncoding(regNumber reg) -{ - static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE"); - return (unsigned)(reg & 0x7); -} - -// Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes -// SSE2: separate 1-byte prefix gets added before opcode. -// AVX: specific bits within VEX prefix need to be set in bit-inverted form. -emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) - { - // W-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // W-bit is the only bit that is added in non bit-inverted form. - return emitter::code_t(code | 0x00008000000000ULL); - } - } -#ifdef TARGET_AMD64 - return emitter::code_t(code | 0x4800000000ULL); -#else - assert(!"UNREACHED"); - return code; -#endif -} - -#ifdef TARGET_AMD64 - -emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) - { - // R-bit is supported by both 2-byte and 3-byte VEX prefix - assert(hasVexPrefix(code)); - - // R-bit is added in bit-inverted form. - return code & 0xFF7FFFFFFFFFFFULL; - } - } - - return code | 0x4400000000ULL; -} - -emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) - { - // X-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // X-bit is added in bit-inverted form. 
- return code & 0xFFBFFFFFFFFFFFULL; - } - } - - return code | 0x4200000000ULL; -} - -emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) - { - // B-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // B-bit is added in bit-inverted form. - return code & 0xFFDFFFFFFFFFFFULL; - } - } - - return code | 0x4100000000ULL; -} - -// Adds REX prefix (0x40) without W, R, X or B bits set -emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) -{ - assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); - return code | 0x4000000000ULL; -} - -#endif // TARGET_AMD64 - -bool isPrefix(BYTE b) -{ - assert(b != 0); // Caller should check this - assert(b != 0x67); // We don't use the address size prefix - assert(b != 0x65); // The GS segment override prefix is emitted separately - assert(b != 0x64); // The FS segment override prefix is emitted separately - assert(b != 0xF0); // The lock prefix is emitted separately - assert(b != 0x2E); // We don't use the CS segment override prefix - assert(b != 0x3E); // Or the DS segment override prefix - assert(b != 0x26); // Or the ES segment override prefix - assert(b != 0x36); // Or the SS segment override prefix - - // That just leaves the size prefixes used in SSE opcodes: - // Scalar Double Scalar Single Packed Double - return ((b == 0xF2) || (b == 0xF3) || (b == 0x66)); -} - -// Outputs VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise. -unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code) -{ - abort(); -} - -#ifdef TARGET_AMD64 -/***************************************************************************** - * Is the last instruction emitted a call instruction? - */ -bool emitter::emitIsLastInsCall() -{ - if ((emitLastIns != nullptr) && (emitLastIns->idIns() == INS_call)) - { - return true; - } - - return false; -} - -/***************************************************************************** - * We're about to create an epilog. If the last instruction we output was a 'call', - * then we need to insert a NOP, to allow for proper exception-handling behavior. - */ -void emitter::emitOutputPreEpilogNOP() -{ - if (emitIsLastInsCall()) - { - emitIns(INS_nop); - } -} - -#endif // TARGET_AMD64 - -// Size of rex prefix in bytes -unsigned emitter::emitGetRexPrefixSize(instruction ins) -{ - // In case of AVX instructions, REX prefixes are part of VEX prefix. - // And hence requires no additional byte to encode REX prefixes. - if (IsAVXInstruction(ins)) - { - return 0; - } - - // If not AVX, then we would need 1-byte to encode REX prefix. - return 1; -} - -// Size of vex prefix in bytes -unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr) -{ - if (IsAVXInstruction(ins)) - { - return 3; - } - - // If not AVX, then we don't need to encode vex prefix. - return 0; -} - -//------------------------------------------------------------------------ -// emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current -// configuration. 
-// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// code -- The current opcode and any known prefixes -unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code) -{ - unsigned adjustedSize = 0; - - if (IsAVXInstruction(ins)) - { - // VEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. - // Therefore, to estimate the size adding VEX prefix size and size of instruction opcode bytes will always - // overstimate. - // Instead this routine will adjust the size of VEX prefix based on the number of bytes of opcode it encodes so - // that - // instruction size estimate will be accurate. - // Basically this will decrease the vexPrefixSize, so that opcodeSize + vexPrefixAdjustedSize will be the right - // size. - // - // rightOpcodeSize + vexPrefixSize - // = (opcodeSize - ExtrabytesSize) + vexPrefixSize - // = opcodeSize + (vexPrefixSize - ExtrabytesSize) - // = opcodeSize + vexPrefixAdjustedSize - - unsigned vexPrefixAdjustedSize = emitGetVexPrefixSize(ins, attr); - assert(vexPrefixAdjustedSize == 3); - - // In this case, opcode will contains escape prefix at least one byte, - // vexPrefixAdjustedSize should be minus one. - vexPrefixAdjustedSize -= 1; - - // Get the fourth byte in Opcode. - // If this byte is non-zero, then we should check whether the opcode contains SIMD prefix or not. - BYTE check = (code >> 24) & 0xFF; - if (check != 0) - { - // 3-byte opcode: with the bytes ordered as 0x2211RM33 or - // 4-byte opcode: with the bytes ordered as 0x22114433 - // Simd prefix is at the first byte. - BYTE sizePrefix = (code >> 16) & 0xFF; - if (sizePrefix != 0 && isPrefix(sizePrefix)) - { - vexPrefixAdjustedSize -= 1; - } - - // If the opcode size is 4 bytes, then the second escape prefix is at fourth byte in opcode. - // But in this case the opcode has not counted R\M part. - // opcodeSize + VexPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize - //=opcodeSize + VexPrefixAdjustedSize -1 + 1 - //=opcodeSize + VexPrefixAdjustedSize - // So although we may have second byte escape prefix, we won't decrease vexPrefixAdjustedSize. - } - - adjustedSize = vexPrefixAdjustedSize; - } - else if (Is4ByteSSEInstruction(ins)) - { - // The 4-Byte SSE instructions require one additional byte to hold the ModRM byte - adjustedSize++; - } - else - { - if (ins == INS_crc32) - { - // Adjust code size for CRC32 that has 4-byte opcode but does not use SSE38 or EES3A encoding. - adjustedSize++; - } - - if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx)) - { - // Most 16-bit operand instructions will need a 0x66 prefix. 
- adjustedSize++; - } - } - - return adjustedSize; -} - -// Get size of rex or vex prefix emitted in code -unsigned emitter::emitGetPrefixSize(code_t code) -{ - if (hasVexPrefix(code)) - { - return 3; - } - - if (hasRexPrefix(code)) - { - return 1; - } - - return 0; -} - -#ifdef TARGET_X86 -/***************************************************************************** - * - * Record a non-empty stack - */ - -void emitter::emitMarkStackLvl(unsigned stackLevel) -{ - assert(int(stackLevel) >= 0); - assert(emitCurStackLvl == 0); - assert(emitCurIG->igStkLvl == 0); - assert(emitCurIGfreeNext == emitCurIGfreeBase); - - assert(stackLevel && stackLevel % sizeof(int) == 0); - - emitCurStackLvl = emitCurIG->igStkLvl = stackLevel; - - if (emitMaxStackDepth < emitCurStackLvl) - { - JITDUMP("Upping emitMaxStackDepth from %d to %d\n", emitMaxStackDepth, emitCurStackLvl); - emitMaxStackDepth = emitCurStackLvl; - } -} -#endif - -/***************************************************************************** - * - * Get hold of the address mode displacement value for an indirect call. - */ - -//inline ssize_t emitter::emitGetInsCIdisp(instrDesc* id) -//{ -// if (id->idIsLargeCall()) -// { -// return ((instrDescCGCA*)id)->idcDisp; -// } -// else -// { -// assert(!id->idIsLargeDsp()); -// assert(!id->idIsLargeCns()); -// -// return id->idAddr()->iiaAddrMode.amDisp; -// } -//} - -/** *************************************************************************** - * - * The following table is used by the instIsFP()/instUse/DefFlags() helpers. - */ - -// clang-format off -const insFlags CodeGenInterface::instInfo[] = -{ - #define INST0(id, nm, um, mr, flags) static_cast(flags), - #define INST1(id, nm, um, mr, flags) static_cast(flags), - #define INST2(id, nm, um, mr, mi, flags) static_cast(flags), - #define INST3(id, nm, um, mr, mi, rm, flags) static_cast(flags), - #define INST4(id, nm, um, mr, mi, rm, a4, flags) static_cast(flags), - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) static_cast(flags), - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 -}; -// clang-format on - -/***************************************************************************** - * - * Initialize the table used by emitInsModeFormat(). - */ - -// clang-format off -const BYTE emitter::emitInsModeFmtTab[] = -{ - #define INST0(id, nm, um, mr, flags) um, - #define INST1(id, nm, um, mr, flags) um, - #define INST2(id, nm, um, mr, mi, flags) um, - #define INST3(id, nm, um, mr, mi, rm, flags) um, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) um, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) um, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 -}; -// clang-format on - -#ifdef DEBUG -unsigned const emitter::emitInsModeFmtCnt = _countof(emitInsModeFmtTab); -#endif - -/***************************************************************************** - * - * Combine the given base format with the update mode of the instuction. - */ - -inline emitter::insFormat emitter::emitInsModeFormat(instruction ins, insFormat base) -{ - assert(IF_RRD + IUM_RD == IF_RRD); - assert(IF_RRD + IUM_WR == IF_RWR); - assert(IF_RRD + IUM_RW == IF_RRW); - - return (insFormat)(base + emitInsUpdateMode(ins)); -} - -// This is a helper we need due to Vs Whidbey #254016 in order to distinguish -// if we can not possibly be updating an integer register. 
This is not the best -// solution, but the other ones (see bug) are going to be much more complicated. -bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) -{ - instruction ins = id->idIns(); - - if (!IsSSEOrAVXInstruction(ins)) - { - return false; - } - - switch (ins) - { - case INS_andn: - case INS_bextr: - case INS_blsi: - case INS_blsmsk: - case INS_blsr: - case INS_bzhi: - case INS_cvttsd2si: - case INS_cvttss2si: - case INS_cvtsd2si: - case INS_cvtss2si: - case INS_extractps: - case INS_mov_xmm2i: - case INS_movmskpd: - case INS_movmskps: - case INS_mulx: - case INS_pdep: - case INS_pext: - case INS_pmovmskb: - case INS_pextrb: - case INS_pextrd: - case INS_pextrq: - case INS_pextrw: - case INS_pextrw_sse41: - case INS_rorx: - { - // These SSE instructions write to a general purpose integer register. - return false; - } - - default: - { - return true; - } - } -} - -/***************************************************************************** - * - * Returns the base encoding of the given CPU instruction. - */ - -inline size_t insCode(instruction ins) -{ - // clang-format off - const static - size_t insCodes[] = - { - #define INST0(id, nm, um, mr, flags) mr, - #define INST1(id, nm, um, mr, flags) mr, - #define INST2(id, nm, um, mr, mi, flags) mr, - #define INST3(id, nm, um, mr, mi, rm, flags) mr, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) mr, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 - }; - // clang-format on - - assert((unsigned)ins < _countof(insCodes)); - assert((insCodes[ins] != BAD_CODE)); - - return insCodes[ins]; -} - -/***************************************************************************** - * - * Returns the "AL/AX/EAX, imm" accumulator encoding of the given instruction. - */ - -inline size_t insCodeACC(instruction ins) -{ - // clang-format off - const static - size_t insCodesACC[] = - { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) - #define INST3(id, nm, um, mr, mi, rm, flags) - #define INST4(id, nm, um, mr, mi, rm, a4, flags) a4, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) a4, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 - }; - // clang-format on - - assert((unsigned)ins < _countof(insCodesACC)); - assert((insCodesACC[ins] != BAD_CODE)); - - return insCodesACC[ins]; -} - -/***************************************************************************** - * - * Returns the "register" encoding of the given CPU instruction. 
- */ - -inline size_t insCodeRR(instruction ins) -{ - // clang-format off - const static - size_t insCodesRR[] = - { - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) - #define INST3(id, nm, um, mr, mi, rm, flags) - #define INST4(id, nm, um, mr, mi, rm, a4, flags) - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rr, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 - }; - // clang-format on - - assert((unsigned)ins < _countof(insCodesRR)); - assert((insCodesRR[ins] != BAD_CODE)); - - return insCodesRR[ins]; -} - -// clang-format off -const static -size_t insCodesRM[] = -{ - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) - #define INST3(id, nm, um, mr, mi, rm, flags) rm, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) rm, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) rm, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 -}; -// clang-format on - -// Returns true iff the give CPU instruction has an RM encoding. -inline bool hasCodeRM(instruction ins) -{ - assert((unsigned)ins < _countof(insCodesRM)); - return ((insCodesRM[ins] != BAD_CODE)); -} - -/***************************************************************************** - * - * Returns the "reg, [r/m]" encoding of the given CPU instruction. - */ - -inline size_t insCodeRM(instruction ins) -{ - assert((unsigned)ins < _countof(insCodesRM)); - assert((insCodesRM[ins] != BAD_CODE)); - - return insCodesRM[ins]; -} - -// clang-format off -const static -size_t insCodesMI[] = -{ - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) - #define INST2(id, nm, um, mr, mi, flags) mi, - #define INST3(id, nm, um, mr, mi, rm, flags) mi, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) mi, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mi, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 -}; -// clang-format on - -// Returns true iff the give CPU instruction has an MI encoding. -inline bool hasCodeMI(instruction ins) -{ - assert((unsigned)ins < _countof(insCodesMI)); - return ((insCodesMI[ins] != BAD_CODE)); -} - -/***************************************************************************** - * - * Returns the "[r/m], 32-bit icon" encoding of the given CPU instruction. - */ - -inline size_t insCodeMI(instruction ins) -{ - assert((unsigned)ins < _countof(insCodesMI)); - assert((insCodesMI[ins] != BAD_CODE)); - - return insCodesMI[ins]; -} - -// clang-format off -const static -size_t insCodesMR[] = -{ - #define INST0(id, nm, um, mr, flags) - #define INST1(id, nm, um, mr, flags) mr, - #define INST2(id, nm, um, mr, mi, flags) mr, - #define INST3(id, nm, um, mr, mi, rm, flags) mr, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) mr, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) mr, - #include "instrs.h" - #undef INST0 - #undef INST1 - #undef INST2 - #undef INST3 - #undef INST4 - #undef INST5 -}; -// clang-format on - -// Returns true iff the give CPU instruction has an MR encoding. 
-inline bool hasCodeMR(instruction ins) -{ - assert((unsigned)ins < _countof(insCodesMR)); - return ((insCodesMR[ins] != BAD_CODE)); -} - -/***************************************************************************** - * - * Returns the "[r/m], reg" or "[r/m]" encoding of the given CPU instruction. - */ - -inline size_t insCodeMR(instruction ins) -{ - assert((unsigned)ins < _countof(insCodesMR)); - assert((insCodesMR[ins] != BAD_CODE)); - - return insCodesMR[ins]; -} - -// Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h. -bool emitter::EncodedBySSE38orSSE3A(instruction ins) -{ - const size_t SSE38 = 0x0F660038; - const size_t SSE3A = 0x0F66003A; - const size_t MASK = 0xFFFF00FF; - - size_t insCode = 0; - - if (!IsSSEOrAVXInstruction(ins)) - { - return false; - } - - if (hasCodeRM(ins)) - { - insCode = insCodeRM(ins); - } - else if (hasCodeMI(ins)) - { - insCode = insCodeMI(ins); - } - else if (hasCodeMR(ins)) - { - insCode = insCodeMR(ins); - } - - insCode &= MASK; - return insCode == SSE38 || insCode == SSE3A; -} - -/***************************************************************************** - * - * Returns an encoding for the specified register to be used in the bit0-2 - * part of an opcode. - */ - -inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code) -{ - assert(reg < REG_STK); - -#ifdef TARGET_AMD64 - // Either code is not NULL or reg is not an extended reg. - // If reg is an extended reg, instruction needs to be prefixed with 'REX' - // which would require code != NULL. - assert(code != nullptr || !IsExtendedReg(reg)); - - if (IsExtendedReg(reg)) - { - *code = AddRexBPrefix(ins, *code); // REX.B - } - else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) - { - // We are assuming that we only use/encode SPL, BPL, SIL and DIL - // not the corresponding AH, CH, DH, or BH - *code = AddRexPrefix(ins, *code); // REX - } -#endif // TARGET_AMD64 - - unsigned regBits = RegEncoding(reg); - - assert(regBits < 8); - return regBits; -} - -/***************************************************************************** - * - * Returns an encoding for the specified register to be used in the bit3-5 - * part of an opcode. - */ - -inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code) -{ - assert(reg < REG_STK); - -#ifdef TARGET_AMD64 - // Either code is not NULL or reg is not an extended reg. - // If reg is an extended reg, instruction needs to be prefixed with 'REX' - // which would require code != NULL. - assert(code != nullptr || !IsExtendedReg(reg)); - - if (IsExtendedReg(reg)) - { - *code = AddRexRPrefix(ins, *code); // REX.R - } - else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) - { - // We are assuming that we only use/encode SPL, BPL, SIL and DIL - // not the corresponding AH, CH, DH, or BH - *code = AddRexPrefix(ins, *code); // REX - } -#endif // TARGET_AMD64 - - unsigned regBits = RegEncoding(reg); - - assert(regBits < 8); - return (regBits << 3); -} - -/*********************************************************************************** - * - * Returns modified AVX opcode with the specified register encoded in bits 3-6 of - * byte 2 of VEX prefix. 
- */ -inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) -{ - assert(reg < REG_STK); - assert(IsAVXInstruction(ins)); - assert(hasVexPrefix(code)); - - // Get 4-bit register encoding - // RegEncoding() gives lower 3 bits - // IsExtendedReg() gives MSB. - code_t regBits = RegEncoding(reg); - if (IsExtendedReg(reg)) - { - regBits |= 0x08; - } - - // VEX prefix encodes register operand in 1's complement form - // Shift count = 4-bytes of opcode + 0-2 bits - assert(regBits <= 0xF); - regBits <<= 35; - return code ^ regBits; -} - -/***************************************************************************** - * - * Returns an encoding for the specified register to be used in the bit3-5 - * part of an SIB byte (unshifted). - * Used exclusively to generate the REX.X bit and truncate the register. - */ - -inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* code) -{ - assert(reg < REG_STK); - -#ifdef TARGET_AMD64 - // Either code is not NULL or reg is not an extended reg. - // If reg is an extended reg, instruction needs to be prefixed with 'REX' - // which would require code != NULL. - assert(code != nullptr || reg < REG_R8 || (reg >= REG_XMM0 && reg < REG_XMM8)); - - if (IsExtendedReg(reg)) - { - *code = AddRexXPrefix(ins, *code); // REX.X - } - unsigned regBits = RegEncoding(reg); -#else // !TARGET_AMD64 - unsigned regBits = reg; -#endif // !TARGET_AMD64 - - assert(regBits < 8); - return regBits; -} - -/***************************************************************************** - * - * Returns the "[r/m]" opcode with the mod/RM field set to register. - */ - -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) -{ - // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. - // Otherwise, it will be placed after the 4 byte encoding. - if ((code & 0xFF00) == 0) - { - assert((code & 0xC000) == 0); - code |= 0xC000; - } - - return code; -} - -/***************************************************************************** - * - * Returns the given "[r/m]" opcode with the mod/RM field set to register. - */ - -inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) -{ - // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. - // Otherwise, it will be placed after the 4 byte encoding. - if ((code & 0xFF00) == 0) - { - assert((code & 0xC000) == 0); - code |= 0xC000; - } - return code; -} - -/***************************************************************************** - * - * Returns the "byte ptr [r/m]" opcode with the mod/RM field set to - * the given register. - */ - -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code) -{ - assert((code & 0xC000) == 0); - code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; - code |= regcode; - return code; -} - -/***************************************************************************** - * - * Returns the "byte ptr [r/m], icon" opcode with the mod/RM field set to - * the given register. 
- */ - -inline emitter::code_t emitter::insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code) -{ - assert((code & 0xC000) == 0); - code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; - code |= regcode; - return code; -} - -/***************************************************************************** - * - * Returns true iff the given instruction does not have a "[r/m], icon" form, but *does* have a - * "reg,reg,imm8" form. - */ -inline bool insNeedsRRIb(instruction ins) -{ - // If this list gets longer, use a switch or a table. - return ins == INS_imul; -} - -/***************************************************************************** - * - * Returns the "reg,reg,imm8" opcode with both the reg's set to the - * the given register. - */ -inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, emitAttr size) -{ - assert(size == EA_4BYTE); // All we handle for now. - assert(insNeedsRRIb(ins)); - // If this list gets longer, use a switch, or a table lookup. - code_t code = 0x69c0; - unsigned regcode = insEncodeReg012(ins, reg, size, &code); - // We use the same register as source and destination. (Could have another version that does both regs...) - code |= regcode; - code |= (regcode << 3); - return code; -} - -/***************************************************************************** - * - * Returns the "+reg" opcode with the the given register set into the low - * nibble of the opcode - */ - -inline emitter::code_t emitter::insEncodeOpreg(instruction ins, regNumber reg, emitAttr size) -{ - code_t code = insCodeRR(ins); - unsigned regcode = insEncodeReg012(ins, reg, size, &code); - code |= regcode; - return code; -} - -/***************************************************************************** - * - * Return the 'SS' field value for the given index scale factor. 
- */ - -inline unsigned emitter::insSSval(unsigned scale) -{ - assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); - - const static BYTE scales[] = { - 0x00, // 1 - 0x40, // 2 - 0xFF, // 3 - 0x80, // 4 - 0xFF, // 5 - 0xFF, // 6 - 0xFF, // 7 - 0xC0, // 8 - }; - - return scales[scale - 1]; -} - -const instruction emitJumpKindInstructions[] = {INS_nop, - -#define JMP_SMALL(en, rev, ins) INS_##ins, -#include "emitjmps.h" - - INS_call}; - -const emitJumpKind emitReverseJumpKinds[] = { - EJ_NONE, - -#define JMP_SMALL(en, rev, ins) EJ_##rev, -#include "emitjmps.h" -}; - -/***************************************************************************** - * Look up the instruction for a jump kind - */ - -/*static*/ instruction emitter::emitJumpKindToIns(emitJumpKind jumpKind) -{ - assert((unsigned)jumpKind < ArrLen(emitJumpKindInstructions)); - return emitJumpKindInstructions[jumpKind]; -} - -/***************************************************************************** - * Reverse the conditional jump - */ - -/* static */ emitJumpKind emitter::emitReverseJumpKind(emitJumpKind jumpKind) -{ - assert(jumpKind < EJ_COUNT); - return emitReverseJumpKinds[jumpKind]; -} - -/***************************************************************************** - * The size for these instructions is less than EA_4BYTE, - * but the target register need not be byte-addressable - */ - -inline bool emitInstHasNoCode(instruction ins) -{ - if (ins == INS_align) - { - return true; - } - - return false; -} - -/***************************************************************************** - * When encoding instructions that operate on byte registers - * we have to ensure that we use a low register (EAX, EBX, ECX or EDX) - * otherwise we will incorrectly encode the instruction - */ - -bool emitter::emitVerifyEncodable(instruction ins, emitAttr size, regNumber reg1, regNumber reg2 /* = REG_NA */) -{ -#if CPU_HAS_BYTE_REGS - if (size != EA_1BYTE) // Not operating on a byte register is fine - { - return true; - } - - if ((ins != INS_movsx) && // These three instructions support high register - (ins != INS_movzx) // encodings for reg1 -#ifdef FEATURE_HW_INTRINSICS - && (ins != INS_crc32) -#endif - ) - { - // reg1 must be a byte-able register - if ((genRegMask(reg1) & RBM_BYTE_REGS) == 0) - { - return false; - } - } - // if reg2 is not REG_NA then reg2 must be a byte-able register - if ((reg2 != REG_NA) && ((genRegMask(reg2) & RBM_BYTE_REGS) == 0)) - { - return false; - } -#endif - // The instruction can be encoded - return true; -} - -/***************************************************************************** - * - * Estimate the size (in bytes of generated code) of the given instruction. - */ - -inline UNATIVE_OFFSET emitter::emitInsSize(code_t code) -{ - UNATIVE_OFFSET size = (code & 0xFF000000) ? 4 : (code & 0x00FF0000) ? 
3 : 2; -#ifdef TARGET_AMD64 - size += emitGetPrefixSize(code); -#endif - return size; -} - -//------------------------------------------------------------------------ -// emitInsSizeRR: Determines the code size for an instruction encoding that does not have any addressing modes -// -// Arguments: -// ins -- The instruction being emitted -// code -- The current opcode and any known prefixes -inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) -{ - assert(false); - //assert(id->idIns() != INS_invalid); - - //instruction ins = id->idIns(); - //emitAttr attr = id->idOpSize(); - - //UNATIVE_OFFSET sz = emitInsSize(code); - - //sz += emitGetAdjustedSize(ins, attr, code); - - //// REX prefix - //if (TakesRexWPrefix(ins, attr) || IsExtendedReg(id->idReg1(), attr) || IsExtendedReg(id->idReg2(), attr) || - // (!id->idIsSmallDsc() && (IsExtendedReg(id->idReg3(), attr) || IsExtendedReg(id->idReg4(), attr)))) - //{ - // sz += emitGetRexPrefixSize(ins); - //} - - //return sz; - return 0; -} - -//------------------------------------------------------------------------ -// emitInsSizeRR: Determines the code size for an instruction encoding that does not have any addressing modes and -// includes an immediate value -// -// Arguments: -// ins -- The instruction being emitted -// code -- The current opcode and any known prefixes -// val -- The immediate value to encode -inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code, int val) -{ - instruction ins = id->idIns(); - UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(id->idOpSize()); - bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(valSize <= sizeof(INT32) || !id->idIsCnsReloc()); -#endif // TARGET_AMD64 - - if (valSize > sizeof(INT32)) - { - valSize = sizeof(INT32); - } - - if (id->idIsCnsReloc()) - { - valInByte = false; // relocs can't be placed in a byte - assert(valSize == sizeof(INT32)); - } - - if (valInByte) - { - valSize = sizeof(char); - } - else - { - assert(!IsSSEOrAVXInstruction(ins)); - } - - return valSize + emitInsSizeRR(id, code); -} - -inline UNATIVE_OFFSET emitter::emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr) -{ - emitAttr size = EA_SIZE(attr); - - UNATIVE_OFFSET sz; - - // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes. - // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes. - // This would probably be better expressed as a different format or something? - code_t code = insCodeRM(ins); - - if ((code & 0xFF00) != 0) - { - sz = IsSSEOrAVXInstruction(ins) ? emitInsSize(code) : 5; - } - else - { - sz = emitInsSize(insEncodeRMreg(ins, code)); - } - - sz += emitGetAdjustedSize(ins, size, insCodeRM(ins)); - - // REX prefix - if (!hasRexPrefix(code)) - { - if ((TakesRexWPrefix(ins, size) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || - IsExtendedReg(reg2, attr)) - { - sz += emitGetRexPrefixSize(ins); - } - } - - return sz; -} - -/*****************************************************************************/ - -inline UNATIVE_OFFSET emitter::emitInsSizeSV(code_t code, int var, int dsp) -{ - UNATIVE_OFFSET size = emitInsSize(code); - UNATIVE_OFFSET offs; - bool offsIsUpperBound = true; - bool EBPbased = true; - - /* Is this a temporary? 
*/ - - if (var < 0) - { - /* An address off of ESP takes an extra byte */ - - if (!emitHasFramePtr) - { - size++; - } - - // The offset is already assigned. Find the temp. - TempDsc* tmp = codeGen->regSet.tmpFindNum(var, RegSet::TEMP_USAGE_USED); - if (tmp == nullptr) - { - // It might be in the free lists, if we're working on zero initializing the temps. - tmp = codeGen->regSet.tmpFindNum(var, RegSet::TEMP_USAGE_FREE); - } - assert(tmp != nullptr); - offs = tmp->tdTempOffs(); - - // We only care about the magnitude of the offset here, to determine instruction size. - if (emitComp->isFramePointerUsed()) - { - if ((int)offs < 0) - { - offs = -(int)offs; - } - } - else - { - // SP-based offsets must already be positive. - assert((int)offs >= 0); - } - } - else - { - - /* Get the frame offset of the (non-temp) variable */ - - offs = dsp + emitComp->lvaFrameAddress(var, &EBPbased); - - /* An address off of ESP takes an extra byte */ - - if (!EBPbased) - { - ++size; - } - - /* Is this a stack parameter reference? */ - - if ((emitComp->lvaIsParameter(var) -#if !defined(TARGET_AMD64) || defined(UNIX_AMD64_ABI) - && !emitComp->lvaIsRegArgument(var) -#endif // !TARGET_AMD64 || UNIX_AMD64_ABI - ) || - (static_cast(var) == emitComp->lvaRetAddrVar)) - { - /* If no EBP frame, arguments and ret addr are off of ESP, above temps */ - - if (!EBPbased) - { - assert((int)offs >= 0); - - offsIsUpperBound = false; // since #temps can increase - offs += emitMaxTmpSize; - } - } - else - { - /* Locals off of EBP are at negative offsets */ - - if (EBPbased) - { -#if defined(TARGET_AMD64) && !defined(UNIX_AMD64_ABI) - // If localloc is not used, then ebp chaining is done and hence - // offset of locals will be at negative offsets, Otherwise offsets - // will be positive. In future, when RBP gets positioned in the - // middle of the frame so as to optimize instruction encoding size, - // the below asserts needs to be modified appropriately. - // However, for Unix platforms, we always do frame pointer chaining, - // so offsets from the frame pointer will always be negative. - if (emitComp->compLocallocUsed || emitComp->opts.compDbgEnC) - { - noway_assert((int)offs >= 0); - } - else -#endif - { - // Dev10 804810 - failing this assert can lead to bad codegen and runtime crashes - CLANG_FORMAT_COMMENT_ANCHOR; - -#ifdef UNIX_AMD64_ABI - LclVarDsc* varDsc = emitComp->lvaTable + var; - bool isRegPassedArg = varDsc->lvIsParam && varDsc->lvIsRegArg; - // Register passed args could have a stack offset of 0. - noway_assert((int)offs < 0 || isRegPassedArg || emitComp->opts.IsOSR()); -#else // !UNIX_AMD64_ABI - - // OSR transitioning to RBP frame currently can have mid-frame FP - noway_assert(((int)offs < 0) || emitComp->opts.IsOSR()); -#endif // !UNIX_AMD64_ABI - } - - assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); - - // lvaInlinedPInvokeFrameVar and lvaStubArgumentVar are placed below the temps - if (unsigned(var) == emitComp->lvaInlinedPInvokeFrameVar || - unsigned(var) == emitComp->lvaStubArgumentVar) - { - offs -= emitMaxTmpSize; - } - - if ((int)offs < 0) - { - // offset is negative - return size + ((int(offs) >= SCHAR_MIN) ? sizeof(char) : sizeof(int)); - } -#ifdef TARGET_AMD64 - // This case arises for localloc frames - else - { - return size + ((offs <= SCHAR_MAX) ? sizeof(char) : sizeof(int)); - } -#endif - } - - if (emitComp->lvaTempsHaveLargerOffsetThanVars() == false) - { - offs += emitMaxTmpSize; - } - } - } - - assert((int)offs >= 0); - -#if !FEATURE_FIXED_OUT_ARGS - - /* Are we addressing off of ESP? 
*/ - - if (!emitHasFramePtr) - { - /* Adjust the effective offset if necessary */ - - if (emitCntStackDepth) - offs += emitCurStackLvl; - - // we could (and used to) check for the special case [sp] here but the stack offset - // estimator was off, and there is very little harm in overestimating for such a - // rare case. - } - -#endif // !FEATURE_FIXED_OUT_ARGS - -// printf("lcl = %04X, tmp = %04X, stk = %04X, offs = %04X\n", -// emitLclSize, emitMaxTmpSize, emitCurStackLvl, offs); - -#ifdef TARGET_AMD64 - bool useSmallEncoding = (SCHAR_MIN <= (int)offs) && ((int)offs <= SCHAR_MAX); -#else - bool useSmallEncoding = (offs <= size_t(SCHAR_MAX)); -#endif - - // If it is ESP based, and the offset is zero, we will not encode the disp part. - if (!EBPbased && offs == 0) - { - return size; - } - else - { - return size + (useSmallEncoding ? sizeof(char) : sizeof(int)); - } -} - -inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp) -{ - assert(id->idIns() != INS_invalid); - instruction ins = id->idIns(); - emitAttr attrSize = id->idOpSize(); - UNATIVE_OFFSET prefix = emitGetAdjustedSize(ins, attrSize, code); - - // REX prefix - if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || - IsExtendedReg(id->idReg2(), attrSize)) - { - prefix += emitGetRexPrefixSize(ins); - } - - return prefix + emitInsSizeSV(code, var, dsp); -} - -inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val) -{ - assert(id->idIns() != INS_invalid); - instruction ins = id->idIns(); - emitAttr attrSize = id->idOpSize(); - UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(attrSize); - UNATIVE_OFFSET prefix = emitGetAdjustedSize(ins, attrSize, code); - bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(valSize <= sizeof(int) || !id->idIsCnsReloc()); -#endif // TARGET_AMD64 - - if (valSize > sizeof(int)) - { - valSize = sizeof(int); - } - - if (id->idIsCnsReloc()) - { - valInByte = false; // relocs can't be placed in a byte - assert(valSize == sizeof(int)); - } - - if (valInByte) - { - valSize = sizeof(char); - } - else - { - assert(!IsSSEOrAVXInstruction(ins)); - } - - // 64-bit operand instructions will need a REX.W prefix - if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || - IsExtendedReg(id->idReg2(), attrSize)) - { - prefix += emitGetRexPrefixSize(ins); - } - - return prefix + valSize + emitInsSizeSV(code, var, dsp); -} - -/*****************************************************************************/ - -//static bool baseRegisterRequiresSibByte(regNumber base) -//{ -//#ifdef TARGET_AMD64 -// return base == REG_ESP || base == REG_R12; -//#else -// return base == REG_ESP; -//#endif -//} - -//static bool baseRegisterRequiresDisplacement(regNumber base) -//{ -//#ifdef TARGET_AMD64 -// return base == REG_EBP || base == REG_R13; -//#else -// return base == REG_EBP; -//#endif -//} - -UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) -{ - assert(false); - return 0; -} - -inline UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code, int val) -{ - assert(id->idIns() != INS_invalid); - instruction ins = id->idIns(); - UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(id->idOpSize()); - bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); - - // We should 
never generate BT mem,reg because it has poor performance. BT mem,imm might be useful - // but it requires special handling of the immediate value (it is always encoded in a byte). - // Let's not complicate things until this is needed. - assert(ins != INS_bt); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(valSize <= sizeof(INT32) || !id->idIsCnsReloc()); -#endif // TARGET_AMD64 - - if (valSize > sizeof(INT32)) - { - valSize = sizeof(INT32); - } - - if (id->idIsCnsReloc()) - { - valInByte = false; // relocs can't be placed in a byte - assert(valSize == sizeof(INT32)); - } - - if (valInByte) - { - valSize = sizeof(char); - } - else - { - assert(!IsSSEOrAVXInstruction(ins)); - } - - return valSize + emitInsSizeAM(id, code); -} - -inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code) -{ - assert(id->idIns() != INS_invalid); - instruction ins = id->idIns(); - emitAttr attrSize = id->idOpSize(); - - // fgMorph changes any statics that won't fit into 32-bit addresses - // into constants with an indir, rather than GT_CLS_VAR - // so we should only hit this path for statics that are RIP-relative - UNATIVE_OFFSET size = sizeof(INT32); - - size += emitGetAdjustedSize(ins, attrSize, code); - - // 64-bit operand instructions will need a REX.W prefix - if (TakesRexWPrefix(ins, attrSize) || IsExtendedReg(id->idReg1(), attrSize) || - IsExtendedReg(id->idReg2(), attrSize)) - { - size += emitGetRexPrefixSize(ins); - } - - return size + emitInsSize(code); -} - -inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code, int val) -{ - instruction ins = id->idIns(); - UNATIVE_OFFSET valSize = EA_SIZE_IN_BYTES(id->idOpSize()); - bool valInByte = ((signed char)val == val) && (ins != INS_mov) && (ins != INS_test); - -#ifndef TARGET_AMD64 - // occasionally longs get here on x86 - if (valSize > sizeof(INT32)) - valSize = sizeof(INT32); -#endif // !TARGET_AMD64 - - if (id->idIsCnsReloc()) - { - valInByte = false; // relocs can't be placed in a byte - assert(valSize == sizeof(INT32)); - } - - if (valInByte) - { - valSize = sizeof(char); - } - else - { - assert(!IsSSEOrAVXInstruction(ins)); - } - - return valSize + emitInsSizeCV(id, code); -} - -/***************************************************************************** - * - * Allocate instruction descriptors for instructions with address modes. - */ - -inline emitter::instrDesc* emitter::emitNewInstrAmd(emitAttr size, ssize_t dsp) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Set the displacement field in an instruction. Only handles instrDescAmd type. - */ - -inline void emitter::emitSetAmdDisp(instrDescAmd* id, ssize_t dsp) -{ - if (dsp < AM_DISP_MIN || dsp > AM_DISP_MAX) - { - id->idSetIsLargeDsp(); -#ifdef DEBUG - id->idAddr()->iiaAddrMode.amDisp = AM_DISP_BIG_VAL; -#endif - id->idaAmdVal = dsp; - } - else - { - id->idSetIsSmallDsp(); - id->idAddr()->iiaAddrMode.amDisp = dsp; - assert(id->idAddr()->iiaAddrMode.amDisp == dsp); // make sure the value fit - } -} - -/***************************************************************************** - * - * Allocate an instruction descriptor for an instruction that uses both - * an address mode displacement and a constant. 
- */ - -emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int cns) -{ - assert(false); - return 0; -} - -//----------------------------------------------------------------------------- -// -// The next instruction will be a loop head entry point -// So insert an alignment instruction here to ensure that -// we can properly align the code. -// -void emitter::emitLoopAlign(unsigned short paddingBytes) -{ - assert(false); -} - -//----------------------------------------------------------------------------- -// -// The next instruction will be a loop head entry point -// So insert alignment instruction(s) here to ensure that -// we can properly align the code. -// -// This emits more than one `INS_align` instruction depending on the -// alignmentBoundary parameter. -// -void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) -{ - assert(false); -} - -/***************************************************************************** - * - * Add a NOP instruction of the given size. - */ - -void emitter::emitIns_Nop(unsigned size) -{ - assert(size <= MAX_ENCODED_SIZE); - - instrDesc* id = emitNewInstr(); - id->idIns(INS_nop); - id->idInsFmt(IF_NONE); - id->idCodeSize(size); - - dispIns(id); - emitCurIGsize += size; -} - -/***************************************************************************** - * - * Add an instruction with no operands. - */ -void emitter::emitIns(instruction ins) -{ - assert(false); -} - -// Add an instruction with no operands, but whose encoding depends on the size -// (Only CDQ/CQO currently) -void emitter::emitIns(instruction ins, emitAttr attr) -{ - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstr(attr); - code_t code = insCodeMR(ins); - assert(ins == INS_cdq); - assert((code & 0xFFFFFF00) == 0); - sz = 1; - - insFormat fmt = IF_NONE; - - sz += emitGetAdjustedSize(ins, attr, code); - if (TakesRexWPrefix(ins, attr)) - { - sz += emitGetRexPrefixSize(ins); - } - - id->idIns(ins); - id->idInsFmt(fmt); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitMapFmtForIns: map the instruction format based on the instruction. -// Shift-by-a-constant instructions have a special format. -// -// Arguments: -// fmt - the instruction format to map -// ins - the instruction -// -// Returns: -// The mapped instruction format. -// -emitter::insFormat emitter::emitMapFmtForIns(insFormat fmt, instruction ins) -{ - switch (ins) - { - case INS_rol_N: - case INS_ror_N: - case INS_rcl_N: - case INS_rcr_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - { - switch (fmt) - { - case IF_RRW_CNS: - return IF_RRW_SHF; - case IF_MRW_CNS: - return IF_MRW_SHF; - case IF_SRW_CNS: - return IF_SRW_SHF; - case IF_ARW_CNS: - return IF_ARW_SHF; - default: - unreached(); - } - } - - default: - return fmt; - } -} - -//------------------------------------------------------------------------ -// emitMapFmtAtoM: map the address mode formats ARD, ARW, and AWR to their direct address equivalents. -// -// Arguments: -// fmt - the instruction format to map -// -// Returns: -// The mapped instruction format. 
-// -emitter::insFormat emitter::emitMapFmtAtoM(insFormat fmt) -{ - switch (fmt) - { - case IF_ARD: - return IF_MRD; - case IF_AWR: - return IF_MWR; - case IF_ARW: - return IF_MRW; - - case IF_RRD_ARD: - return IF_RRD_MRD; - case IF_RWR_ARD: - return IF_RWR_MRD; - case IF_RWR_ARD_CNS: - return IF_RWR_MRD_CNS; - case IF_RRW_ARD: - return IF_RRW_MRD; - case IF_RRW_ARD_CNS: - return IF_RRW_MRD_CNS; - case IF_RWR_RRD_ARD: - return IF_RWR_RRD_MRD; - case IF_RWR_RRD_ARD_CNS: - return IF_RWR_RRD_MRD_CNS; - case IF_RWR_RRD_ARD_RRD: - return IF_RWR_RRD_MRD_RRD; - - case IF_ARD_RRD: - return IF_MRD_RRD; - case IF_AWR_RRD: - return IF_MWR_RRD; - case IF_ARW_RRD: - return IF_MRW_RRD; - - case IF_ARD_CNS: - return IF_MRD_CNS; - case IF_AWR_CNS: - return IF_MWR_CNS; - case IF_ARW_CNS: - return IF_MRW_CNS; - - case IF_AWR_RRD_CNS: - return IF_MWR_RRD_CNS; - - case IF_ARW_SHF: - return IF_MRW_SHF; - - default: - unreached(); - } -} - -//------------------------------------------------------------------------ -// emitHandleMemOp: For a memory operand, fill in the relevant fields of the instrDesc. -// -// Arguments: -// indir - the memory operand. -// id - the instrDesc to fill in. -// fmt - the instruction format to use. This must be one of the ARD, AWR, or ARW formats. If necessary (such as for -// GT_CLS_VAR_ADDR), this function will map it to the correct format. -// ins - the instruction we are generating. This might affect the instruction format we choose. -// -// Assumptions: -// The correctly sized instrDesc must already be created, e.g., via emitNewInstrAmd() or emitNewInstrAmdCns(); -// -// Post-conditions: -// For base address of int constant: -// -- the caller must have added the int constant base to the instrDesc when creating it via -// emitNewInstrAmdCns(). -// For simple address modes (base + scale * index + offset): -// -- the base register, index register, and scale factor are set. -// -- the caller must have added the addressing mode offset int constant to the instrDesc when creating it via -// emitNewInstrAmdCns(). -// -// The instruction format is set. -// -// idSetIsDspReloc() is called if necessary. -// -void emitter::emitHandleMemOp(GenTreeIndir* indir, instrDesc* id, insFormat fmt, instruction ins) -{ - assert(false); -} - -// Takes care of storing all incoming register parameters -// into its corresponding shadow space (defined by the x64 ABI) -void emitter::spillIntArgRegsToShadowSlots() -{ - unsigned argNum; - instrDesc* id; - UNATIVE_OFFSET sz; - - assert(emitComp->compGeneratingProlog); - - for (argNum = 0; argNum < MAX_REG_ARG; ++argNum) - { - regNumber argReg = intArgRegs[argNum]; - - // The offsets for the shadow space start at RSP + 8 - // (right before the caller return address) - int offset = (argNum + 1) * EA_PTRSIZE; - - id = emitNewInstrAmd(EA_PTRSIZE, offset); - id->idIns(INS_mov); - id->idInsFmt(IF_AWR_RRD); - id->idAddr()->iiaAddrMode.amBaseReg = REG_SPBASE; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(1); - - // The offset has already been set in the intrDsc ctor, - // make sure we got it right. - assert(emitGetInsAmdAny(id) == ssize_t(offset)); - - id->idReg1(argReg); - sz = emitInsSizeAM(id, insCodeMR(INS_mov)); - id->idCodeSize(sz); - emitCurIGsize += sz; - } -} - -//------------------------------------------------------------------------ -// emitInsLoadInd: Emits a "mov reg, [mem]" (or a variant such as "movzx" or "movss") -// instruction for a GT_IND node. 
-// -// Arguments: -// ins - the instruction to emit -// attr - the instruction operand size -// dstReg - the destination register -// mem - the GT_IND node -// -void emitter::emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem) -{ - assert(mem->OperIs(GT_IND, GT_NULLCHECK)); - - GenTree* addr = mem->Addr(); - - if (addr->OperGet() == GT_CLS_VAR_ADDR) - { - emitIns_R_C(ins, attr, dstReg, addr->AsClsVar()->gtClsVarHnd, 0); - return; - } - - if (addr->OperIs(GT_LCL_VAR_ADDR, GT_LCL_FLD_ADDR)) - { - GenTreeLclVarCommon* varNode = addr->AsLclVarCommon(); - unsigned offset = varNode->GetLclOffs(); - emitIns_R_S(ins, attr, dstReg, varNode->GetLclNum(), offset); - - // Updating variable liveness after instruction was emitted. - // TODO-Review: it appears that this call to genUpdateLife does nothing because it - // returns quickly when passed GT_LCL_VAR_ADDR or GT_LCL_FLD_ADDR. Below, emitInsStoreInd - // had similar code that replaced `varNode` with `mem` (to fix a GC hole). It might be - // appropriate to do that here as well, but doing so showed no asm diffs, so it's not - // clear when this scenario gets hit, at least for GC refs. - codeGen->genUpdateLife(varNode); - return; - } - - assert(addr->OperIsAddrMode() || (addr->IsCnsIntOrI() && addr->isContained()) || !addr->isContained()); - ssize_t offset = mem->Offset(); - instrDesc* id = emitNewInstrAmd(attr, offset); - id->idIns(ins); - id->idReg1(dstReg); - emitHandleMemOp(mem, id, IF_RWR_ARD, ins); - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitInsStoreInd: Emits a "mov [mem], reg/imm" (or a variant such as "movss") -// instruction for a GT_STOREIND node. 
-// -// Arguments: -// ins - the instruction to emit -// attr - the instruction operand size -// mem - the GT_STOREIND node -// -void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* mem) -{ - assert(mem->OperIs(GT_STOREIND)); - - GenTree* addr = mem->Addr(); - GenTree* data = mem->Data(); - - if (addr->OperGet() == GT_CLS_VAR_ADDR) - { - if (data->isContainedIntOrIImmed()) - { - emitIns_C_I(ins, attr, addr->AsClsVar()->gtClsVarHnd, 0, (int)data->AsIntConCommon()->IconValue()); - } - else - { - assert(!data->isContained()); - emitIns_C_R(ins, attr, addr->AsClsVar()->gtClsVarHnd, data->GetRegNum(), 0); - } - return; - } - - if (addr->OperIs(GT_LCL_VAR_ADDR, GT_LCL_FLD_ADDR)) - { - GenTreeLclVarCommon* varNode = addr->AsLclVarCommon(); - unsigned offset = varNode->GetLclOffs(); - if (data->isContainedIntOrIImmed()) - { - emitIns_S_I(ins, attr, varNode->GetLclNum(), offset, (int)data->AsIntConCommon()->IconValue()); - } - else - { - assert(!data->isContained()); - emitIns_S_R(ins, attr, data->GetRegNum(), varNode->GetLclNum(), offset); - } - - // Updating variable liveness after instruction was emitted - codeGen->genUpdateLife(mem); - return; - } - - ssize_t offset = mem->Offset(); - UNATIVE_OFFSET sz; - instrDesc* id; - - if (data->isContainedIntOrIImmed()) - { - int icon = (int)data->AsIntConCommon()->IconValue(); - id = emitNewInstrAmdCns(attr, offset, icon); - id->idIns(ins); - emitHandleMemOp(mem, id, IF_AWR_CNS, ins); - sz = emitInsSizeAM(id, insCodeMI(ins), icon); - id->idCodeSize(sz); - } - else - { - assert(!data->isContained()); - id = emitNewInstrAmd(attr, offset); - id->idIns(ins); - emitHandleMemOp(mem, id, IF_AWR_RRD, ins); - id->idReg1(data->GetRegNum()); - sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - } - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitInsStoreLcl: Emits a "mov [mem], reg/imm" (or a variant such as "movss") -// instruction for a GT_STORE_LCL_VAR node. 
-// -// Arguments: -// ins - the instruction to emit -// attr - the instruction operand size -// varNode - the GT_STORE_LCL_VAR node -// -void emitter::emitInsStoreLcl(instruction ins, emitAttr attr, GenTreeLclVarCommon* varNode) -{ - assert(varNode->OperIs(GT_STORE_LCL_VAR)); - assert(varNode->GetRegNum() == REG_NA); // stack store - - GenTree* data = varNode->gtGetOp1(); - codeGen->inst_set_SV_var(varNode); - - if (data->isContainedIntOrIImmed()) - { - emitIns_S_I(ins, attr, varNode->GetLclNum(), 0, (int)data->AsIntConCommon()->IconValue()); - } - else - { - assert(!data->isContained()); - emitIns_S_R(ins, attr, data->GetRegNum(), varNode->GetLclNum(), 0); - } - - // Updating variable liveness after instruction was emitted - codeGen->genUpdateLife(varNode); -} - -//------------------------------------------------------------------------ -// emitInsBinary: Emits an instruction for a node which takes two operands -// -// Arguments: -// ins - the instruction to emit -// attr - the instruction operand size -// dst - the destination and first source operand -// src - the second source operand -// -// Assumptions: -// i) caller of this routine needs to call genConsumeReg() -// ii) caller of this routine needs to call genProduceReg() -regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src) -{ - assert(false); - return (regNumber)0; -} - -//------------------------------------------------------------------------ -// emitInsRMW: Emit logic for Read-Modify-Write binary instructions. -// -// Responsible for emitting a single instruction that will perform an operation of the form: -// *addr = *addr src -// For example: -// ADD [RAX], RCX -// -// Arguments: -// ins - instruction to generate -// attr - emitter attribute for instruction -// storeInd - indir for RMW addressing mode -// src - source operand of instruction -// -// Assumptions: -// Lowering has taken care of recognizing the StoreInd pattern of: -// StoreInd( AddressTree, BinOp( Ind ( AddressTree ), Operand ) ) -// The address to store is already sitting in a register. -// -// Notes: -// This is a no-produce operation, meaning that no register output will -// be produced for future use in the code stream. 
-// -void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeInd, GenTree* src) -{ - GenTree* addr = storeInd->Addr(); - addr = addr->gtSkipReloadOrCopy(); - assert(addr->OperIs(GT_LCL_VAR, GT_LCL_VAR_ADDR, GT_LEA, GT_CLS_VAR_ADDR, GT_CNS_INT)); - - instrDesc* id = nullptr; - UNATIVE_OFFSET sz; - - ssize_t offset = 0; - if (addr->OperGet() != GT_CLS_VAR_ADDR) - { - offset = storeInd->Offset(); - } - - if (src->isContainedIntOrIImmed()) - { - GenTreeIntConCommon* intConst = src->AsIntConCommon(); - int iconVal = (int)intConst->IconValue(); - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - iconVal &= 0x7F; - break; - default: - break; - } - - id = emitNewInstrAmdCns(attr, offset, iconVal); - emitHandleMemOp(storeInd, id, IF_ARW_CNS, ins); - id->idIns(ins); - sz = emitInsSizeAM(id, insCodeMI(ins), iconVal); - } - else - { - assert(!src->isContained()); // there must be one non-contained src - - // ind, reg - id = emitNewInstrAmd(attr, offset); - emitHandleMemOp(storeInd, id, IF_ARW_RRD, ins); - id->idReg1(src->GetRegNum()); - id->idIns(ins); - sz = emitInsSizeAM(id, insCodeMR(ins)); - } - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitInsRMW: Emit logic for Read-Modify-Write unary instructions. -// -// Responsible for emitting a single instruction that will perform an operation of the form: -// *addr = UnaryOp *addr -// For example: -// NOT [RAX] -// -// Arguments: -// ins - instruction to generate -// attr - emitter attribute for instruction -// storeInd - indir for RMW addressing mode -// -// Assumptions: -// Lowering has taken care of recognizing the StoreInd pattern of: -// StoreInd( AddressTree, UnaryOp( Ind ( AddressTree ) ) ) -// The address to store is already sitting in a register. -// -// Notes: -// This is a no-produce operation, meaning that no register output will -// be produced for future use in the code stream. -// -void emitter::emitInsRMW(instruction ins, emitAttr attr, GenTreeStoreInd* storeInd) -{ - GenTree* addr = storeInd->Addr(); - addr = addr->gtSkipReloadOrCopy(); - assert(addr->OperIs(GT_LCL_VAR, GT_LCL_VAR_ADDR, GT_CLS_VAR_ADDR, GT_LEA, GT_CNS_INT)); - - ssize_t offset = 0; - if (addr->OperGet() != GT_CLS_VAR_ADDR) - { - offset = storeInd->Offset(); - } - - instrDesc* id = emitNewInstrAmd(attr, offset); - emitHandleMemOp(storeInd, id, IF_ARW, ins); - id->idIns(ins); - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Add an instruction referencing a single register. 
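The binary emitInsRMW path above folds the load, the operation, and the store into one instruction of the form *addr = *addr op src (for example ADD [RAX], RCX), and it masks shift/rotate immediates with 0x7F, presumably to keep the count a small non-negative value that fits the byte immediate (the CPU only consumes the low 5 or 6 bits of the count anyway). A scalar sketch of both pieces, with illustrative names:

// Scalar model of the read-modify-write shape the emitter encodes as a single
// instruction (e.g. "add dword ptr [rax], ecx"), plus the 7-bit masking the
// emitter applies to shift/rotate immediates.
#include <cstdint>
#include <cstdio>

void rmwAdd(int32_t* addr, int32_t src)
{
    *addr = *addr + src;    // one x86 instruction: add [addr], src
}

int32_t maskShiftImm(int32_t icon)
{
    return icon & 0x7F;     // same masking as emitInsRMW for the INS_*_N shifts
}

int main()
{
    int32_t cell = 40;
    rmwAdd(&cell, 2);
    printf("%d %d\n", cell, maskShiftImm(0x183)); // 42 3
}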
- */ - -void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) -{ - emitAttr size = EA_SIZE(attr); - - assert(size <= EA_PTRSIZE); - noway_assert(emitVerifyEncodable(ins, size, reg)); - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrSmall(attr); - - switch (ins) - { - case INS_inc: - case INS_dec: -#ifdef TARGET_AMD64 - - sz = 2; // x64 has no 1-byte opcode (it is the same encoding as the REX prefix) - -#else // !TARGET_AMD64 - - if (size == EA_1BYTE) - sz = 2; // Use the long form as the small one has no 'w' bit - else - sz = 1; // Use short form - -#endif // !TARGET_AMD64 - - break; - - case INS_pop: - case INS_pop_hide: - case INS_push: - case INS_push_hide: - - /* We don't currently push/pop small values */ - - assert(size == EA_PTRSIZE); - - sz = 1; - break; - - default: - - /* All the sixteen INS_setCCs are contiguous. */ - - if (INS_seto <= ins && ins <= INS_setg) - { - // Rough check that we used the endpoints for the range check - - assert(INS_seto + 0xF == INS_setg); - - // The caller must specify EA_1BYTE for 'attr' - - assert(attr == EA_1BYTE); - - /* We expect this to always be a 'big' opcode */ - - assert(insEncodeMRreg(ins, reg, attr, insCodeMR(ins)) & 0x00FF0000); - - size = attr; - - sz = 3; - break; - } - else - { - sz = 2; - break; - } - } - insFormat fmt = emitInsModeFormat(ins, IF_RRD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(reg); - - // Vex bytes - sz += emitGetAdjustedSize(ins, attr, insEncodeMRreg(ins, reg, attr, insCodeMR(ins))); - - // REX byte - if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) - { - sz += emitGetRexPrefixSize(ins); - } - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -/***************************************************************************** - * - * Add an instruction referencing a register and a constant. - */ - -void emitter::emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, ssize_t val) -{ - assert(false); -} - -/***************************************************************************** - * - * Add an instruction referencing an integer constant. - */ - -void emitter::emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val) -{ - UNATIVE_OFFSET sz; - instrDesc* id; - bool valInByte = ((signed char)val == (target_ssize_t)val); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - if (EA_IS_CNS_RELOC(attr)) - { - valInByte = false; // relocs can't be placed in a byte - } - - switch (ins) - { - case INS_loop: - case INS_jge: - sz = 2; - break; - - case INS_ret: - sz = 3; - break; - - case INS_push_hide: - case INS_push: - sz = valInByte ? 2 : 5; - break; - - default: - NO_WAY("unexpected instruction"); - } - - id = emitNewInstrSC(attr, val); - id->idIns(ins); - id->idInsFmt(IF_CNS); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -/***************************************************************************** - * - * Add a "jump through a table" instruction. 
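emitIns_I sizes the push form by testing whether the constant survives a round trip through a sign-extended byte (valInByte). A standalone version of that test and the resulting sizes is below, using the standard x86 encodings (push imm8 is two bytes, push imm32 is five); the helper names are ours.

// Standalone version of the "does this immediate fit in a sign-extended
// byte?" test emitIns_I uses to size a push: 6A ib (2 bytes) when it fits,
// 68 id (5 bytes) otherwise.
#include <cstdio>

bool fitsInSignedByte(long long val)
{
    return (signed char)val == val;   // round-trip through int8 is lossless
}

unsigned pushImmSize(long long val)
{
    return fitsInSignedByte(val) ? 2u : 5u;
}

int main()
{
    printf("%u %u %u\n", pushImmSize(5), pushImmSize(-128), pushImmSize(300));
    // 2 2 5
}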
- */ - -void emitter::emitIns_IJ(emitAttr attr, regNumber reg, unsigned base) -{ - assert(EA_SIZE(attr) == EA_4BYTE); - - UNATIVE_OFFSET sz = 3 + 4; - const instruction ins = INS_i_jmp; - - if (IsExtendedReg(reg, attr)) - { - sz += emitGetRexPrefixSize(ins); - } - - instrDesc* id = emitNewInstrAmd(attr, base); - - id->idIns(ins); - id->idInsFmt(IF_ARD); - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = reg; - id->idAddr()->iiaAddrMode.amScale = emitter::OPSZP; - -#ifdef DEBUG - id->idDebugOnlyInfo()->idMemCookie = base; -#endif - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Add an instruction with a static data member operand. If 'size' is 0, the - * instruction operates on the address of the static member instead of its - * value (e.g. "push offset clsvar", rather than "push dword ptr [clsvar]"). - */ - -void emitter::emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, int offs) -{ - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - UNATIVE_OFFSET sz; - instrDesc* id; - - /* Are we pushing the offset of the class variable? */ - - if (EA_IS_OFFSET(attr)) - { - assert(ins == INS_push); - sz = 1 + TARGET_POINTER_SIZE; - - id = emitNewInstrDsp(EA_1BYTE, offs); - id->idIns(ins); - id->idInsFmt(IF_MRD_OFF); - } - else - { - insFormat fmt = emitInsModeFormat(ins, IF_MRD); - - id = emitNewInstrDsp(attr, offs); - id->idIns(ins); - id->idInsFmt(fmt); - sz = emitInsSizeCV(id, insCodeMR(ins)); - } - - if (TakesRexWPrefix(ins, attr)) - { - // REX.W prefix - sz += emitGetRexPrefixSize(ins); - } - - id->idAddr()->iiaFieldHnd = fldHnd; - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -/***************************************************************************** - * - * Add an instruction with two register operands. - */ - -void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2) -{ - emitAttr size = EA_SIZE(attr); - - /* We don't want to generate any useless mov instructions! */ - CLANG_FORMAT_COMMENT_ANCHOR; - -#ifdef TARGET_AMD64 - // Same-reg 4-byte mov can be useful because it performs a - // zero-extension to 8 bytes. - assert(ins != INS_mov || reg1 != reg2 || size == EA_4BYTE); -#else - assert(ins != INS_mov || reg1 != reg2); -#endif // TARGET_AMD64 - - assert(size <= EA_32BYTE); - noway_assert(emitVerifyEncodable(ins, size, reg1, reg2)); - - UNATIVE_OFFSET sz = emitInsSizeRR(ins, reg1, reg2, attr); - - /* Special case: "XCHG" uses a different format */ - insFormat fmt = (ins == INS_xchg) ? IF_RRW_RRW : emitInsModeFormat(ins, IF_RRD_RRD); - - instrDesc* id = emitNewInstrSmall(attr); - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(reg1); - id->idReg2(reg2); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Add an instruction with two register operands and an integer constant. 
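emitIns_R_R above allows a same-register 4-byte mov on AMD64 because writing a 32-bit register implicitly zero-extends into the full 64-bit register, so "mov eax, eax" is not a useless move. A small C++ analogue of that semantics, purely for illustration:

// C++ analogue of the zero-extension a 4-byte register write performs on x64.
#include <cstdint>
#include <cstdio>

int main()
{
    uint64_t rax = 0xDEADBEEF00000001ull;
    rax = static_cast<uint32_t>(rax);            // models the 32-bit register write
    printf("0x%llx\n", (unsigned long long)rax); // 0x1: upper 32 bits cleared
}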
- */ - -void emitter::emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival) -{ -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - instrDesc* id = emitNewInstrSC(attr, ival); - - id->idIns(ins); - id->idInsFmt(IF_RRW_RRW_CNS); - id->idReg1(reg1); - id->idReg2(reg2); - - code_t code = 0; - - switch (ins) - { - case INS_pextrb: - case INS_pextrd: - case INS_pextrq: - case INS_pextrw_sse41: - case INS_extractps: - case INS_vextractf128: - case INS_vextracti128: - case INS_shld: - case INS_shrd: - { - code = insCodeMR(ins); - break; - } - - case INS_psrldq: - case INS_pslldq: - { - code = insCodeMI(ins); - break; - } - - default: - { - code = insCodeRM(ins); - break; - } - } - - UNATIVE_OFFSET sz = emitInsSizeRR(id, code, ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs) -{ - assert(ins == INS_prefetcht0 || ins == INS_prefetcht1 || ins == INS_prefetcht2 || ins == INS_prefetchnta); - - instrDesc* id = emitNewInstrAmd(attr, offs); - - id->idIns(ins); - - id->idInsFmt(IF_ARD); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitIns_AR_R_R: emits the code for an instruction that takes a base memory register, two register operands -// and that does not return a value -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op2Reg -- The register of the second operand -// op3Reg -- The register of the third operand -// base -- The base register used for the memory address (first operand) -// offs -- The offset from base -// -void emitter::emitIns_AR_R_R( - instruction ins, emitAttr attr, regNumber op2Reg, regNumber op3Reg, regNumber base, int offs) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - instrDesc* id = emitNewInstrAmd(attr, offs); - - id->idIns(ins); - id->idReg1(op2Reg); - id->idReg2(op3Reg); - - id->idInsFmt(IF_AWR_RRD_RRD); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir) -{ - ssize_t offs = indir->Offset(); - instrDesc* id = emitNewInstrAmd(attr, offs); - - id->idIns(ins); - id->idReg1(reg1); - - emitHandleMemOp(indir, id, IF_RRW_ARD, ins); - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival) -{ - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); - - ssize_t offs = indir->Offset(); - instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); - - id->idIns(ins); - id->idReg1(reg1); - - emitHandleMemOp(indir, id, IF_RRW_ARD_CNS, ins); - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - 
dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival) -{ - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); - - instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); - - id->idIns(ins); - id->idReg1(reg1); - - id->idInsFmt(IF_RRW_ARD_CNS); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_C_I( - instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) -{ - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); - - instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); - - id->idIns(ins); - id->idInsFmt(IF_RRW_MRD_CNS); - id->idReg1(reg1); - id->idAddr()->iiaFieldHnd = fldHnd; - - UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival) -{ - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg1)); - assert(IsSSEOrAVXInstruction(ins)); - - instrDesc* id = emitNewInstrCns(attr, ival); - - id->idIns(ins); - id->idInsFmt(IF_RRW_SRD_CNS); - id->idReg1(reg1); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - - UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs, ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - instrDesc* id = emitNewInstr(attr); - - id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_SRD); - id->idReg1(reg1); - id->idReg2(reg2); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - - UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - ssize_t offs = indir->Offset(); - instrDesc* id = emitNewInstrAmd(attr, offs); - - id->idIns(ins); - id->idReg1(reg1); - id->idReg2(reg2); - - emitHandleMemOp(indir, id, IF_RWR_RRD_ARD, ins); - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - instrDesc* id = emitNewInstrAmd(attr, offs); - - id->idIns(ins); - id->idReg1(reg1); - id->idReg2(reg2); - - id->idInsFmt(IF_RWR_RRD_ARD); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - 
emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// IsAVX2GatherInstruction: return true if the instruction is AVX2 Gather -// -// Arguments: -// ins - the instruction to check -// Return Value: -// true if the instruction is AVX2 Gather -// -bool IsAVX2GatherInstruction(instruction ins) -{ - switch (ins) - { - case INS_vpgatherdd: - case INS_vpgatherdq: - case INS_vpgatherqd: - case INS_vpgatherqq: - case INS_vgatherdps: - case INS_vgatherdpd: - case INS_vgatherqps: - case INS_vgatherqpd: - return true; - default: - return false; - } -} - -//------------------------------------------------------------------------ -// emitIns_R_AR_R: Emits an AVX2 Gather instructions -// -// Arguments: -// ins - the instruction to emit -// attr - the instruction operand size -// reg1 - the destination and first source operand -// reg2 - the mask operand (encoded in VEX.vvvv) -// base - the base register of address to load -// index - the index register of VSIB -// scale - the scale number of VSIB -// offs - the offset added to the memory address from base -// -void emitter::emitIns_R_AR_R(instruction ins, - emitAttr attr, - regNumber reg1, - regNumber reg2, - regNumber base, - regNumber index, - int scale, - int offs) -{ - assert(IsAVX2GatherInstruction(ins)); - - instrDesc* id = emitNewInstrAmd(attr, offs); - - id->idIns(ins); - id->idReg1(reg1); - id->idReg2(reg2); - - id->idInsFmt(IF_RWR_ARD_RRD); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = index; - id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale); - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_R_C( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - instrDesc* id = emitNewInstrDsp(attr, offs); - - id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_MRD); - id->idReg1(reg1); - id->idReg2(reg2); - id->idAddr()->iiaFieldHnd = fldHnd; - - UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** -* -* Add an instruction with three register operands. 
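emitIns_R_AR_R encodes the AVX2 gather forms, whose memory operand is a VSIB address (base + per-lane index * scale + displacement) and whose mask register, carried here in reg2, gates each lane. A scalar model of a vpgatherdd-style gather is sketched below; the function and its layout are illustrative, not the hardware definition.

// Scalar model of the AVX2 gather shape emitIns_R_AR_R encodes
// (vpgatherdd dst, [base + index*scale + disp], mask): each lane whose mask
// element has its sign bit set loads from base + index[i]*scale + disp,
// other lanes keep their previous destination value.
#include <cstdint>
#include <cstdio>

void gatherDD(int32_t dst[8], const uint8_t* base, const int32_t index[8],
              int scale, int disp, const int32_t mask[8])
{
    for (int i = 0; i < 8; i++)
    {
        if (mask[i] < 0) // sign bit set => lane is active
            dst[i] = *(const int32_t*)(base + (int64_t)index[i] * scale + disp);
    }
}

int main()
{
    int32_t table[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    int32_t idx[8]   = { 7, 6, 5, 4, 3, 2, 1, 0 };
    int32_t mask[8]  = { -1, -1, 0, 0, -1, -1, 0, 0 };
    int32_t dst[8]   = { 0 };

    gatherDD(dst, (const uint8_t*)table, idx, sizeof(int32_t), 0, mask);
    for (int v : dst) printf("%d ", v);   // 17 16 0 0 13 12 0 0
    printf("\n");
}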
-*/ - -void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2) -{ - assert(false); -} - -void emitter::emitIns_R_R_AR_I( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); - - id->idIns(ins); - id->idReg1(reg1); - id->idReg2(reg2); - - id->idInsFmt(IF_RWR_RRD_ARD_CNS); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_R_C_I( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); - - id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_MRD_CNS); - id->idReg1(reg1); - id->idReg2(reg2); - id->idAddr()->iiaFieldHnd = fldHnd; - - UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/********************************************************************************** -* emitIns_R_R_R_I: Add an instruction with three register operands and an immediate. -* -* Arguments: -* ins - the instruction to add -* attr - the emitter attribute for instruction -* targetReg - the target (destination) register -* reg1 - the first source register -* reg2 - the second source register -* ival - the immediate value -*/ - -void emitter::emitIns_R_R_R_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, int ival) -{ - assert(false); -} - -void emitter::emitIns_R_R_S_I( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival) -{ - assert(IsSSEOrAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); - - instrDesc* id = emitNewInstrCns(attr, ival); - - id->idIns(ins); - id->idInsFmt(IF_RWR_RRD_SRD_CNS); - id->idReg1(reg1); - id->idReg2(reg2); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - - UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs, ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// encodeXmmRegAsIval: Encodes a XMM register into imm[7:4] for use by a SIMD instruction -// -// Arguments -// opReg -- The register being encoded -// -// Returns: -// opReg encoded in imm[7:4] -static int encodeXmmRegAsIval(regNumber opReg) -{ - // AVX/AVX2 supports 4-reg format for vblendvps/vblendvpd/vpblendvb, - // which encodes the fourth register into imm8[7:4] - assert(opReg >= XMMBASE); - int ival = (opReg - XMMBASE) << 4; - - assert((ival >= 0) && (ival <= 255)); - return (int8_t)ival; -} - -//------------------------------------------------------------------------ -// emitIns_R_R_A_R: emits the code for an instruction that takes a register operand, a GenTreeIndir address, -// another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- 
The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op3Reg -- The register of the third operand -// indir -- The GenTreeIndir used for the memory address -// -// Remarks: -// op2 is built from indir -// -void emitter::emitIns_R_R_A_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir) -{ - assert(isAvxBlendv(ins)); - assert(UseVEXEncoding()); - - int ival = encodeXmmRegAsIval(op3Reg); - ssize_t offs = indir->Offset(); - instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); - - id->idIns(ins); - id->idReg1(targetReg); - id->idReg2(op1Reg); - - emitHandleMemOp(indir, id, IF_RWR_RRD_ARD_RRD, ins); - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitIns_R_R_AR_R: emits the code for an instruction that takes a register operand, a base memory -// register, another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operands -// op3Reg -- The register of the third operand -// base -- The base register used for the memory address -// offs -- The offset added to the memory address from base -// -// Remarks: -// op2 is built from base + offs -// -void emitter::emitIns_R_R_AR_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base, int offs) -{ - assert(isAvxBlendv(ins)); - assert(UseVEXEncoding()); - - int ival = encodeXmmRegAsIval(op3Reg); - instrDesc* id = emitNewInstrAmdCns(attr, offs, ival); - - id->idIns(ins); - id->idReg1(targetReg); - id->idReg2(op1Reg); - - id->idInsFmt(IF_RWR_RRD_ARD_RRD); - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -//------------------------------------------------------------------------ -// emitIns_R_R_C_R: emits the code for an instruction that takes a register operand, a field handle + -// offset, another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op3Reg -- The register of the third operand -// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address -// offs -- The offset added to the memory address from fldHnd -// -// Remarks: -// op2 is built from fldHnd + offs -// -void emitter::emitIns_R_R_C_R(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - regNumber op3Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs) -{ - assert(isAvxBlendv(ins)); - assert(UseVEXEncoding()); - - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - int ival = encodeXmmRegAsIval(op3Reg); - instrDesc* id = emitNewInstrCnsDsp(attr, ival, offs); - - id->idIns(ins); - id->idReg1(targetReg); - id->idReg2(op1Reg); - - id->idInsFmt(IF_RWR_RRD_MRD_RRD); - id->idAddr()->iiaFieldHnd = fldHnd; - - UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins), ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - 
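encodeXmmRegAsIval packs the fourth register operand of the variable-blend instructions into bits [7:4] of the immediate byte, and the instruction itself picks each element from the second or third source based on the sign bit of the corresponding mask element. A standalone sketch of both, with our own helper names and a plain 0-based register index:

// (a) Packing an XMM register number into imm8[7:4], as encodeXmmRegAsIval
//     does for the 4-operand AVX blend forms.
// (b) The per-element select vblendvps performs using each mask element's
//     sign bit.
#include <cstdint>
#include <cstring>
#include <cstdio>

int encodeXmmRegAsImm8(int regIndex)        // 0 = xmm0, 1 = xmm1, ...
{
    return (regIndex << 4) & 0xF0;          // fourth operand lives in imm8[7:4]
}

void blendvps(float dst[4], const float a[4], const float b[4], const float mask[4])
{
    for (int i = 0; i < 4; i++)
    {
        uint32_t bits;
        std::memcpy(&bits, &mask[i], sizeof(bits));
        dst[i] = (bits >> 31) ? b[i] : a[i]; // mask sign bit selects b
    }
}

int main()
{
    float a[4] = { 1, 2, 3, 4 }, b[4] = { 10, 20, 30, 40 };
    float m[4] = { -0.0f, 0.0f, -1.0f, 1.0f }, d[4];
    blendvps(d, a, b, m);
    printf("imm8=0x%02X  %g %g %g %g\n", encodeXmmRegAsImm8(3), d[0], d[1], d[2], d[3]);
    // imm8=0x30  10 2 30 4
}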
-//------------------------------------------------------------------------ -// emitIns_R_R_R_S: emits the code for a instruction that takes a register operand, a variable index + -// offset, another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op3Reg -- The register of the third operand -// varx -- The variable index used for the memory address -// offs -- The offset added to the memory address from varx -// -// Remarks: -// op2 is built from varx + offs -// -void emitter::emitIns_R_R_S_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, int varx, int offs) -{ - assert(isAvxBlendv(ins)); - assert(UseVEXEncoding()); - - int ival = encodeXmmRegAsIval(op3Reg); - instrDesc* id = emitNewInstrCns(attr, ival); - - id->idIns(ins); - id->idReg1(targetReg); - id->idReg2(op1Reg); - - id->idInsFmt(IF_RWR_RRD_SRD_RRD); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - - UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs, ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_R_R_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3) -{ - assert(false); -} - -/***************************************************************************** - * - * Add an instruction with a register + static member operands. - */ -void emitter::emitIns_R_C(instruction ins, emitAttr attr, regNumber reg, CORINFO_FIELD_HANDLE fldHnd, int offs) -{ - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - emitAttr size = EA_SIZE(attr); - - assert(size <= EA_32BYTE); - noway_assert(emitVerifyEncodable(ins, size, reg)); - - UNATIVE_OFFSET sz; - instrDesc* id; - - // Are we MOV'ing the offset of the class variable into EAX? - if (EA_IS_OFFSET(attr)) - { - id = emitNewInstrDsp(EA_1BYTE, offs); - id->idIns(ins); - id->idInsFmt(IF_RWR_MRD_OFF); - id->idReg1(reg); - - assert(ins == INS_mov && reg == REG_EAX); - - // Special case: "mov eax, [addr]" is smaller - sz = 1 + TARGET_POINTER_SIZE; - } - else - { - insFormat fmt = emitInsModeFormat(ins, IF_RRD_MRD); - - id = emitNewInstrDsp(attr, offs); - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(reg); - -#ifdef TARGET_X86 - // Special case: "mov eax, [addr]" is smaller. - // This case is not enabled for amd64 as it always uses RIP relative addressing - // and it results in smaller instruction size than encoding 64-bit addr in the - // instruction. - if (ins == INS_mov && reg == REG_EAX) - { - sz = 1 + TARGET_POINTER_SIZE; - if (size == EA_2BYTE) - sz += 1; - } - else -#endif // TARGET_X86 - { - sz = emitInsSizeCV(id, insCodeRM(ins)); - } - - // Special case: mov reg, fs:[ddd] - if (fldHnd == FLD_GLOBAL_FS) - { - sz += 1; - } - } - - id->idCodeSize(sz); - - id->idAddr()->iiaFieldHnd = fldHnd; - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Add an instruction with a static member + register operands. 
- */ - -void emitter::emitIns_C_R(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, regNumber reg, int offs) -{ - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - emitAttr size = EA_SIZE(attr); - -#if defined(TARGET_X86) - // For x86 it is valid to storeind a double sized operand in an xmm reg to memory - assert(size <= EA_8BYTE); -#else - assert(size <= EA_PTRSIZE); -#endif - - noway_assert(emitVerifyEncodable(ins, size, reg)); - - instrDesc* id = emitNewInstrDsp(attr, offs); - insFormat fmt = emitInsModeFormat(ins, IF_MRD_RRD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(reg); - - UNATIVE_OFFSET sz; - -#ifdef TARGET_X86 - // Special case: "mov [addr], EAX" is smaller. - // This case is not enable for amd64 as it always uses RIP relative addressing - // and it will result in smaller instruction size than encoding 64-bit addr in - // the instruction. - if (ins == INS_mov && reg == REG_EAX) - { - sz = 1 + TARGET_POINTER_SIZE; - - if (size == EA_2BYTE) - sz += 1; - - // REX prefix - if (TakesRexWPrefix(ins, attr) || IsExtendedReg(reg, attr)) - { - sz += emitGetRexPrefixSize(ins); - } - } - else -#endif // TARGET_X86 - { - sz = emitInsSizeCV(id, insCodeMR(ins)); - } - - // Special case: mov reg, fs:[ddd] - if (fldHnd == FLD_GLOBAL_FS) - { - sz += 1; - } - - id->idCodeSize(sz); - - id->idAddr()->iiaFieldHnd = fldHnd; - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Add an instruction with a static member + constant. - */ - -void emitter::emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, int offs, int val) -{ - // Static always need relocs - if (!jitStaticFldIsGlobAddr(fldHnd)) - { - attr = EA_SET_FLG(attr, EA_DSP_RELOC_FLG); - } - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_MRW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_MRD_CNS); - break; - } - - instrDesc* id = emitNewInstrCnsDsp(attr, val, offs); - id->idIns(ins); - id->idInsFmt(fmt); - id->idAddr()->iiaFieldHnd = fldHnd; - - code_t code = insCodeMI(ins); - UNATIVE_OFFSET sz = emitInsSizeCV(id, code, val); - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_J_S(instruction ins, emitAttr attr, BasicBlock* dst, int varx, int offs) -{ - assert(ins == INS_mov); - assert(dst->bbFlags & BBF_JMP_TARGET); - - instrDescLbl* id = emitNewInstrLbl(); - - id->idIns(ins); - id->idInsFmt(IF_SWR_LABEL); - id->idAddr()->iiaBBlabel = dst; - - /* The label reference is always long */ - - id->idjShort = 0; - id->idjKeepLong = 1; - - /* Record the current IG and offset within it */ - - id->idjIG = emitCurIG; - id->idjOffs = emitCurIGsize; - - /* Append this instruction to this IG's jump list */ - - id->idjNext = emitCurIGjmpList; - emitCurIGjmpList = id; - - UNATIVE_OFFSET sz = sizeof(INT32) + emitInsSizeSV(id, insCodeMI(ins), varx, offs); - id->dstLclVar.initLclVarAddr(varx, offs); -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - -#if EMITTER_STATS - emitTotalIGjmps++; -#endif - -#ifndef TARGET_AMD64 - // Storing the address of a basicBlock will need a reloc - // as the instruction uses the absolute address, - // not a relative address. 
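emitIns_C_I, like the other shift paths in this file, asserts val != 1 for the INS_*_N variants. Shift-by-one has its own dedicated opcode on x86 (the D1 group) while other counts use the imm8 form (the C1 group), so a count of one is presumably expected to arrive as a separate shift-by-one instruction. A sketch of the two encodings for shl eax, using standard x86 opcode bytes and a helper of our own:

// Hypothetical helper (ours, not the jit's): encode "shl eax, count" with the
// dedicated shift-by-one opcode D1 /4 when count == 1 and the imm8 form
// C1 /4 ib otherwise.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<uint8_t> encodeShlEax(uint8_t count)
{
    if (count == 1)
        return { 0xD1, 0xE0 };        // shl eax, 1
    return { 0xC1, 0xE0, count };     // shl eax, imm8
}

int main()
{
    const uint8_t counts[] = { 1, 5 };
    for (uint8_t c : counts)
    {
        printf("shl eax, %u ->", (unsigned)c);
        for (uint8_t b : encodeShlEax(c))
            printf(" %02X", (unsigned)b);
        printf("\n");
    }
}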
- // - // On Amd64, Absolute code addresses should always go through a reloc to - // to be encoded as RIP rel32 offset. - if (emitComp->opts.compReloc) -#endif - { - id->idSetIsDspReloc(); - } - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Add a label instruction. - */ -void emitter::emitIns_R_L(instruction ins, emitAttr attr, BasicBlock* dst, regNumber reg) -{ - assert(ins == INS_lea); - assert(dst->bbFlags & BBF_JMP_TARGET); - - instrDescJmp* id = emitNewInstrJmp(); - - id->idIns(ins); - id->idReg1(reg); - id->idInsFmt(IF_RWR_LABEL); - id->idOpSize(EA_SIZE(attr)); // emitNewInstrJmp() sets the size (incorrectly) to EA_1BYTE - id->idAddr()->iiaBBlabel = dst; - - /* The label reference is always long */ - - id->idjShort = 0; - id->idjKeepLong = 1; - - /* Record the current IG and offset within it */ - - id->idjIG = emitCurIG; - id->idjOffs = emitCurIGsize; - - /* Append this instruction to this IG's jump list */ - - id->idjNext = emitCurIGjmpList; - emitCurIGjmpList = id; - -#ifdef DEBUG - // Mark the catch return - if (emitComp->compCurBB->bbJumpKind == BBJ_EHCATCHRET) - { - id->idDebugOnlyInfo()->idCatchRet = true; - } -#endif // DEBUG - -#if EMITTER_STATS - emitTotalIGjmps++; -#endif - - // Set the relocation flags - these give hint to zap to perform - // relocation of the specified 32bit address. - // - // Note the relocation flags influence the size estimate. - id->idSetRelocFlags(attr); - - UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * The following adds instructions referencing address modes. 
- */ - -void emitter::emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_ARW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_ARD_CNS); - break; - } - - /* - Useful if you want to trap moves with 0 constant - if (ins == INS_mov && val == 0 && EA_SIZE(attr) >= EA_4BYTE) - { - printf("MOV 0\n"); - } - */ - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmdCns(attr, disp, val); - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = reg; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMI(ins), val); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_I_AI(instruction ins, emitAttr attr, int val, ssize_t disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_ARW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_ARD_CNS); - break; - } - - /* - Useful if you want to trap moves with 0 constant - if (ins == INS_mov && val == 0 && EA_SIZE(attr) >= EA_4BYTE) - { - printf("MOV 0\n"); - } - */ - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmdCns(attr, disp, val); - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMI(ins), val); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber base, int disp) -{ - emitIns_R_ARX(ins, attr, reg, base, REG_NA, 1, disp); -} - -void emitter::emitIns_R_AI(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE) && (ireg != REG_NA)); - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmd(attr, disp); - insFormat fmt = emitInsModeFormat(ins, IF_RRD_ARD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(ireg); - - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_AR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp) -{ - 
emitIns_ARX_R(ins, attr, reg, base, REG_NA, 1, disp); -} - -//------------------------------------------------------------------------ -// emitIns_S_R_I: emits the code for an instruction that takes a stack operand, -// a register operand, and an immediate. -// -// Arguments: -// ins - The instruction being emitted -// attr - The emit attribute -// varNum - The varNum of the stack operand -// offs - The offset for the stack operand -// reg - The register operand -// ival - The immediate value -// -void emitter::emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs, regNumber reg, int ival) -{ - // This is only used for INS_vextracti128 and INS_vextractf128, and for these 'ival' must be 0 or 1. - assert(ins == INS_vextracti128 || ins == INS_vextractf128); - assert((ival == 0) || (ival == 1)); - instrDesc* id = emitNewInstrAmdCns(attr, 0, ival); - - id->idIns(ins); - id->idInsFmt(IF_SWR_RRD_CNS); - id->idReg1(reg); - id->idAddr()->iiaLclVar.initLclVarAddr(varNum, offs); -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - - UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeMR(ins), varNum, offs, ival); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, regNumber reg, int imm) -{ - assert((ins == INS_vextracti128) || (ins == INS_vextractf128)); - assert(attr == EA_32BYTE); - assert(reg != REG_NA); - - instrDesc* id = emitNewInstrAmdCns(attr, indir->Offset(), imm); - id->idIns(ins); - id->idReg1(reg); - emitHandleMemOp(indir, id, IF_AWR_RRD_CNS, ins); - UNATIVE_OFFSET size = emitInsSizeAM(id, insCodeMR(ins), imm); - id->idCodeSize(size); - dispIns(id); - emitCurIGsize += size; -} - -void emitter::emitIns_AI_R(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp) -{ - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmd(attr, disp); - insFormat fmt; - - if (ireg == REG_NA) - { - fmt = emitInsModeFormat(ins, IF_ARD); - } - else - { - fmt = emitInsModeFormat(ins, IF_ARD_RRD); - - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); - - id->idReg1(ireg); - } - - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = REG_NA; - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -void emitter::emitIns_I_ARR(instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, int disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_ARW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_ARD_CNS); - break; - } - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmdCns(attr, disp, val); - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = reg; - id->idAddr()->iiaAddrMode.amIndxReg = rg2; - id->idAddr()->iiaAddrMode.amScale = 
emitter::OPSZ1; - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMI(ins), val); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_ARR(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp) -{ - emitIns_R_ARX(ins, attr, reg, base, index, 1, disp); -} - -void emitter::emitIns_ARR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp) -{ - emitIns_ARX_R(ins, attr, reg, base, index, 1, disp); -} - -void emitter::emitIns_I_ARX( - instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, unsigned mul, int disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_ARW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_ARD_CNS); - break; - } - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmdCns(attr, disp, val); - - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = reg; - id->idAddr()->iiaAddrMode.amIndxReg = rg2; - id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMI(ins), val); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_ARX( - instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp) -{ - assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE) && (reg != REG_NA)); - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); - - if ((ins == INS_lea) && (reg == base) && (index == REG_NA) && (disp == 0)) - { - // Maybe the emitter is not the common place for this optimization, but it's a better choke point - // for all the emitIns(ins, tree), we would have to be analyzing at each call site - // - return; - } - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmd(attr, disp); - insFormat fmt = emitInsModeFormat(ins, IF_RRD_ARD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(reg); - - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = index; - id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(scale); - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_ARX_R( - instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, cnsval_ssize_t disp) -{ - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmd(attr, disp); - insFormat fmt; - - if (reg == REG_NA) - { - fmt = emitInsModeFormat(ins, IF_ARD); - } - else - { - fmt = emitInsModeFormat(ins, IF_ARD_RRD); - - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); - assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE)); - - id->idReg1(reg); - } - - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = base; - id->idAddr()->iiaAddrMode.amIndxReg = index; - 
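emitEncodeScale, used by emitIns_R_ARX and the other address-mode overloads here, presumably maps the index multiplier 1/2/4/8 to the 2-bit scale field of an x86 SIB byte, i.e. its log2. A standalone version of that mapping:

// Standalone version of the scale mapping: the x86 SIB byte stores the index
// multiplier 1/2/4/8 as a 2-bit log2 value.
#include <cassert>
#include <cstdio>

unsigned encodeScale(unsigned mul)
{
    assert(mul == 1 || mul == 2 || mul == 4 || mul == 8);
    unsigned enc = 0;
    while ((1u << enc) != mul)
        enc++;
    return enc;          // 1->0, 2->1, 4->2, 8->3
}

int main()
{
    unsigned muls[] = { 1, 2, 4, 8 };
    for (unsigned mul : muls)
        printf("scale %u -> SIB.scale %u\n", mul, encodeScale(mul));
}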
id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(scale); - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -void emitter::emitIns_I_AX(instruction ins, emitAttr attr, int val, regNumber reg, unsigned mul, int disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_ARW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_ARD_CNS); - break; - } - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmdCns(attr, disp, val); - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = reg; - id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMI(ins), val); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp) -{ - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE) && (ireg != REG_NA)); - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmd(attr, disp); - insFormat fmt = emitInsModeFormat(ins, IF_RRD_ARD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(ireg); - - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = reg; - id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeRM(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp) -{ - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstrAmd(attr, disp); - insFormat fmt; - - if (ireg == REG_NA) - { - fmt = emitInsModeFormat(ins, IF_ARD); - } - else - { - fmt = emitInsModeFormat(ins, IF_ARD_RRD); - noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), ireg)); - assert((CodeGen::instIsFP(ins) == false) && (EA_SIZE(attr) <= EA_8BYTE)); - - id->idReg1(ireg); - } - - id->idIns(ins); - id->idInsFmt(fmt); - - id->idAddr()->iiaAddrMode.amBaseReg = REG_NA; - id->idAddr()->iiaAddrMode.amIndxReg = reg; - id->idAddr()->iiaAddrMode.amScale = emitEncodeScale(mul); - - assert(emitGetInsAmdAny(id) == disp); // make sure "disp" is stored properly - - sz = emitInsSizeAM(id, insCodeMR(ins)); - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_I: emits the code for an instruction that takes a register operand, an immediate operand -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The 
target register -// op1Reg -- The register of the first operand -// ival -- The immediate value -// -// Notes: -// This will handle the required register copy if 'op1Reg' and 'targetReg' are not the same, and -// the 3-operand format is not available. -// This is not really SIMD-specific, but is currently only used in that context, as that's -// where we frequently need to handle the case of generating 3-operand or 2-operand forms -// depending on what target ISA is supported. -// -void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival) -{ - if (UseVEXEncoding() || IsDstSrcImmAvxInstruction(ins)) - { - emitIns_R_R_I(ins, attr, targetReg, op1Reg, ival); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_I(ins, attr, targetReg, ival); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_A: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// indir -- The GenTreeIndir used for the memory address -// -void emitter::emitIns_SIMD_R_R_A( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_A(ins, attr, targetReg, op1Reg, indir); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_A(ins, attr, targetReg, indir); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_AR: emits the code for a SIMD instruction that takes a register operand, a base memory register, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// base -- The base register used for the memory address -// offset -- The memory offset -// -void emitter::emitIns_SIMD_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, offset); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_AR(ins, attr, targetReg, base, offset); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_C: emits the code for a SIMD instruction that takes a register operand, a field handle + offset, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address -// offs -- The offset added to the memory address from fldHnd -// -void emitter::emitIns_SIMD_R_R_C( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_C(ins, attr, targetReg, fldHnd, offs); - } 
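The emitIns_SIMD_R_R_* helpers all follow one pattern: when VEX encoding is available the destination may differ from both sources, so a single non-destructive three-operand form is emitted; otherwise op1 is first copied into the target with movaps (when they differ) and a destructive two-operand form then updates the target in place, which is also why the non-VEX path must keep op2 out of the target register. A sketch of that dispatch against a hypothetical mini-emitter (the interface and names are ours, not the jit's):

// Sketch of the dispatch pattern shared by the emitIns_SIMD_R_R_* helpers,
// written against a hypothetical mini-emitter so it stands alone.
#include <cassert>
#include <cstdio>
#include <string>

struct MiniEmitter
{
    bool useVex;

    void emit3(const std::string& ins, int target, int op1, int op2)
    {
        printf("%s xmm%d, xmm%d, xmm%d\n", ins.c_str(), target, op1, op2);
    }
    void emit2(const std::string& ins, int target, int op2)
    {
        printf("%s xmm%d, xmm%d\n", ins.c_str(), target, op2);
    }

    void emitSimd_R_R_R(const std::string& ins, int target, int op1, int op2)
    {
        if (useVex)
        {
            emit3(ins, target, op1, op2);    // non-destructive VEX form
            return;
        }
        if (op1 != target)
        {
            assert(op2 != target);           // the copy must not clobber op2
            emit2("movaps", target, op1);    // target = op1
        }
        emit2(ins, target, op2);             // target op= op2
    }
};

int main()
{
    MiniEmitter sse{ false }, avx{ true };
    sse.emitSimd_R_R_R("addps", /*target*/ 0, /*op1*/ 1, /*op2*/ 2);
    avx.emitSimd_R_R_R("vaddps", 0, 1, 2);
}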
-} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R: emits the code for a SIMD instruction that takes two register operands, and that returns a -// value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op2Reg -- The register of the second operand -// -void emitter::emitIns_SIMD_R_R_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_R(ins, attr, targetReg, op1Reg, op2Reg); - } - else - { - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 - assert(op2Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_R(ins, attr, targetReg, op2Reg); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_S: emits the code for a SIMD instruction that takes a register operand, a variable index + offset, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// varx -- The variable index used for the memory address -// offs -- The offset added to the memory address from varx -// -void emitter::emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_S(ins, attr, targetReg, varx, offs); - } -} - -#ifdef FEATURE_HW_INTRINSICS -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_A_I: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, -// an immediate operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// indir -- The GenTreeIndir used for the memory address -// ival -- The immediate value -// -void emitter::emitIns_SIMD_R_R_A_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_A_I(ins, attr, targetReg, op1Reg, indir, ival, IF_RWR_RRD_ARD_CNS); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_A_I(ins, attr, targetReg, indir, ival); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_AR_I: emits the code for a SIMD instruction that takes a register operand, a base memory register, -// an immediate operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// base -- The base register used for the memory address -// ival -- The immediate value -// -void emitter::emitIns_SIMD_R_R_AR_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int ival) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_AR_I(ins, attr, targetReg, op1Reg, base, 0, ival); - } - else - { - 
if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_AR_I(ins, attr, targetReg, base, 0, ival); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_C_I: emits the code for a SIMD instruction that takes a register operand, a field handle + offset, -// an immediate operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address -// offs -- The offset added to the memory address from fldHnd -// ival -- The immediate value -// -void emitter::emitIns_SIMD_R_R_C_I(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs, - int ival) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_C_I(ins, attr, targetReg, op1Reg, fldHnd, offs, ival); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_C_I(ins, attr, targetReg, fldHnd, offs, ival); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R_I: emits the code for a SIMD instruction that takes two register operands, an immediate operand, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op2Reg -- The register of the second operand -// ival -- The immediate value -// -void emitter::emitIns_SIMD_R_R_R_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int ival) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_R_I(ins, attr, targetReg, op1Reg, op2Reg, ival); - } - else - { - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 - assert(op2Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_R_I(ins, attr, targetReg, op2Reg, ival); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_S_I: emits the code for a SIMD instruction that takes a register operand, a variable index + offset, -// an imediate operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// varx -- The variable index used for the memory address -// offs -- The offset added to the memory address from varx -// ival -- The immediate value -// -void emitter::emitIns_SIMD_R_R_S_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, int ival) -{ - if (UseVEXEncoding()) - { - emitIns_R_R_S_I(ins, attr, targetReg, op1Reg, varx, offs, ival); - } - else - { - if (op1Reg != targetReg) - { - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_S_I(ins, attr, targetReg, varx, offs, ival); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R_A: emits the code for a SIMD instruction that takes two register operands, a GenTreeIndir address, -// and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the 
first operand -// op2Reg -- The register of the second operand -// indir -- The GenTreeIndir used for the memory address -// -void emitter::emitIns_SIMD_R_R_R_A( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir) -{ - assert(IsFMAInstruction(ins)); - assert(UseVEXEncoding()); - - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 - assert(op2Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_R_A(ins, attr, targetReg, op2Reg, indir); -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R_AR: emits the code for a SIMD instruction that takes two register operands, a base memory -// register, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operands -// op2Reg -- The register of the second operand -// base -- The base register used for the memory address -// -void emitter::emitIns_SIMD_R_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber base) -{ - assert(IsFMAInstruction(ins)); - assert(UseVEXEncoding()); - - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 - assert(op2Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_R_AR(ins, attr, targetReg, op2Reg, base, 0); -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R_C: emits the code for a SIMD instruction that takes two register operands, a field handle + -// offset, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op2Reg -- The register of the second operand -// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address -// offs -- The offset added to the memory address from fldHnd -// -void emitter::emitIns_SIMD_R_R_R_C(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - regNumber op2Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs) -{ - assert(IsFMAInstruction(ins)); - assert(UseVEXEncoding()); - - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 - assert(op2Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs); -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R_R: emits the code for a SIMD instruction that takes three register operands, and that returns a -// value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op2Reg -- The register of the second operand -// op3Reg -- The register of the second operand -// -void emitter::emitIns_SIMD_R_R_R_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg) -{ - if (IsFMAInstruction(ins)) - { - assert(UseVEXEncoding()); - - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 or op3 - - assert(op2Reg != targetReg); - assert(op3Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_R_R(ins, attr, targetReg, op2Reg, op3Reg); - } - 
else if (UseVEXEncoding()) - { - assert(isAvxBlendv(ins) || isSse41Blendv(ins)); - - // convert SSE encoding of SSE4.1 instructions to VEX encoding - switch (ins) - { - case INS_blendvps: - ins = INS_vblendvps; - break; - case INS_blendvpd: - ins = INS_vblendvpd; - break; - case INS_pblendvb: - ins = INS_vpblendvb; - break; - default: - break; - } - emitIns_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3Reg); - } - else - { - assert(isSse41Blendv(ins)); - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - if (op3Reg != REG_XMM0) - { - // Ensure we aren't overwriting op1 or op2 - assert(op1Reg != REG_XMM0); - assert(op2Reg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); - } - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 or oop3 (which should be REG_XMM0) - assert(op2Reg != targetReg); - assert(targetReg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - emitIns_R_R(ins, attr, targetReg, op2Reg); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_R_S: emits the code for a SIMD instruction that takes two register operands, a variable index + -// offset, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op2Reg -- The register of the second operand -// varx -- The variable index used for the memory address -// offs -- The offset added to the memory address from varx -// -void emitter::emitIns_SIMD_R_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs) -{ - assert(IsFMAInstruction(ins)); - assert(UseVEXEncoding()); - - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op2 - assert(op2Reg != targetReg); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs); -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_A_R: emits the code for a SIMD instruction that takes a register operand, a GenTreeIndir address, -// another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op3Reg -- The register of the third operand -// indir -- The GenTreeIndir used for the memory address -// -void emitter::emitIns_SIMD_R_R_A_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir) -{ - if (UseVEXEncoding()) - { - assert(isAvxBlendv(ins) || isSse41Blendv(ins)); - - // convert SSE encoding of SSE4.1 instructions to VEX encoding - switch (ins) - { - case INS_blendvps: - { - ins = INS_vblendvps; - break; - } - - case INS_blendvpd: - { - ins = INS_vblendvpd; - break; - } - - case INS_pblendvb: - { - ins = INS_vpblendvb; - break; - } - - default: - { - break; - } - } - - emitIns_R_R_A_R(ins, attr, targetReg, op1Reg, op3Reg, indir); - } - else - { - assert(isSse41Blendv(ins)); - - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - if (op3Reg != REG_XMM0) - { - // Ensure we aren't overwriting op1 - assert(op1Reg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); - } - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op3 (which should be REG_XMM0) - assert(targetReg != 
REG_XMM0); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_A(ins, attr, targetReg, indir); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_AR_R: emits the code for a SIMD instruction that takes a register operand, a base memory -// register, another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operands -// op3Reg -- The register of the third operand -// base -- The base register used for the memory address -// -void emitter::emitIns_SIMD_R_R_AR_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base) -{ - if (UseVEXEncoding()) - { - assert(isAvxBlendv(ins) || isSse41Blendv(ins)); - - // convert SSE encoding of SSE4.1 instructions to VEX encoding - switch (ins) - { - case INS_blendvps: - { - ins = INS_vblendvps; - break; - } - - case INS_blendvpd: - { - ins = INS_vblendvpd; - break; - } - - case INS_pblendvb: - { - ins = INS_vpblendvb; - break; - } - - default: - { - break; - } - } - - emitIns_R_R_AR_R(ins, attr, targetReg, op1Reg, op3Reg, base, 0); - } - else - { - assert(isSse41Blendv(ins)); - - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - if (op3Reg != REG_XMM0) - { - // Ensure we aren't overwriting op1 - assert(op1Reg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); - } - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op3 (which should be REG_XMM0) - assert(targetReg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_AR(ins, attr, targetReg, base, 0); - } -} - -//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_C_R: emits the code for a SIMD instruction that takes a register operand, a field handle + -// offset, another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op3Reg -- The register of the third operand -// fldHnd -- The CORINFO_FIELD_HANDLE used for the memory address -// offs -- The offset added to the memory address from fldHnd -// -void emitter::emitIns_SIMD_R_R_C_R(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - regNumber op3Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs) -{ - if (UseVEXEncoding()) - { - assert(isAvxBlendv(ins) || isSse41Blendv(ins)); - - // convert SSE encoding of SSE4.1 instructions to VEX encoding - switch (ins) - { - case INS_blendvps: - { - ins = INS_vblendvps; - break; - } - - case INS_blendvpd: - { - ins = INS_vblendvpd; - break; - } - - case INS_pblendvb: - { - ins = INS_vpblendvb; - break; - } - - default: - { - break; - } - } - - emitIns_R_R_C_R(ins, attr, targetReg, op1Reg, op3Reg, fldHnd, offs); - } - else - { - assert(isSse41Blendv(ins)); - - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - if (op3Reg != REG_XMM0) - { - // Ensure we aren't overwriting op1 - assert(op1Reg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); - } - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op3 (which should be REG_XMM0) - assert(targetReg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_C(ins, attr, targetReg, fldHnd, offs); - } -} - 
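// Note: all of the emitIns_SIMD_R_R_*_R helpers in this group share one dispatch pattern:
// when VEX encoding is available, the SSE4.1 blend opcode is rewritten to its VEX vblendv*
// form and emitted as a true multi-operand instruction; without VEX, SSE4.1 blendv*
// implicitly takes its mask operand (op3) in XMM0 and the destination register must already
// hold op1, so movaps copies are emitted first to satisfy those constraints.
// A minimal sketch of the shared opcode remap follows, for reference only;
// MapBlendvToVexForm is a hypothetical helper name, not an existing emitter API.
static instruction MapBlendvToVexForm(instruction ins)
{
    switch (ins)
    {
        case INS_blendvps:
            return INS_vblendvps; // SSE4.1 blendvps -> VEX vblendvps (explicit mask operand)
        case INS_blendvpd:
            return INS_vblendvpd;
        case INS_pblendvb:
            return INS_vpblendvb;
        default:
            return ins; // non-blendv instructions are emitted unchanged
    }
}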
-//------------------------------------------------------------------------ -// emitIns_SIMD_R_R_S_R: emits the code for a SIMD instruction that takes a register operand, a variable index + -// offset, another register operand, and that returns a value in register -// -// Arguments: -// ins -- The instruction being emitted -// attr -- The emit attribute -// targetReg -- The target register -// op1Reg -- The register of the first operand -// op3Reg -- The register of the third operand -// varx -- The variable index used for the memory address -// offs -- The offset added to the memory address from varx -// -void emitter::emitIns_SIMD_R_R_S_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, int varx, int offs) -{ - if (UseVEXEncoding()) - { - assert(isAvxBlendv(ins) || isSse41Blendv(ins)); - - // convert SSE encoding of SSE4.1 instructions to VEX encoding - switch (ins) - { - case INS_blendvps: - { - ins = INS_vblendvps; - break; - } - - case INS_blendvpd: - { - ins = INS_vblendvpd; - break; - } - - case INS_pblendvb: - { - ins = INS_vpblendvb; - break; - } - - default: - { - break; - } - } - - emitIns_R_R_S_R(ins, attr, targetReg, op1Reg, op3Reg, varx, offs); - } - else - { - assert(isSse41Blendv(ins)); - - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - if (op3Reg != REG_XMM0) - { - // Ensure we aren't overwriting op1 - assert(op1Reg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, REG_XMM0, op3Reg); - } - if (op1Reg != targetReg) - { - // Ensure we aren't overwriting op3 (which should be REG_XMM0) - assert(targetReg != REG_XMM0); - - emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); - } - - emitIns_R_S(ins, attr, targetReg, varx, offs); - } -} -#endif // FEATURE_HW_INTRINSICS - -/***************************************************************************** - * - * The following add instructions referencing stack-based local variables. 
- */ - -void emitter::emitIns_S(instruction ins, emitAttr attr, int varx, int offs) -{ - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstr(attr); - insFormat fmt = emitInsModeFormat(ins, IF_SRD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - - sz = emitInsSizeSV(id, insCodeMR(ins), varx, offs); - id->idCodeSize(sz); - -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - dispIns(id); - emitCurIGsize += sz; - - emitAdjustStackDepthPushPop(ins); -} - -void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs) -{ - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstr(attr); - insFormat fmt = emitInsModeFormat(ins, IF_SRD_RRD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(ireg); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - - sz = emitInsSizeSV(id, insCodeMR(ins), varx, offs); - -#ifdef TARGET_X86 - if (attr == EA_1BYTE) - { - assert(isByteReg(ireg)); - } -#endif - - id->idCodeSize(sz); -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs) -{ - emitAttr size = EA_SIZE(attr); - noway_assert(emitVerifyEncodable(ins, size, ireg)); - - UNATIVE_OFFSET sz; - instrDesc* id = emitNewInstr(attr); - insFormat fmt = emitInsModeFormat(ins, IF_RRD_SRD); - - id->idIns(ins); - id->idInsFmt(fmt); - id->idReg1(ireg); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - - sz = emitInsSizeSV(id, insCodeRM(ins), varx, offs); - id->idCodeSize(sz); -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - dispIns(id); - emitCurIGsize += sz; -} - -void emitter::emitIns_S_I(instruction ins, emitAttr attr, int varx, int offs, int val) -{ -#ifdef TARGET_AMD64 - // mov reg, imm64 is the only opcode which takes a full 8 byte immediate - // all other opcodes take a sign-extended 4-byte immediate - noway_assert(EA_SIZE(attr) < EA_8BYTE || !EA_IS_CNS_RELOC(attr)); -#endif - - insFormat fmt; - - switch (ins) - { - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - assert(val != 1); - fmt = IF_SRW_SHF; - val &= 0x7F; - break; - - default: - fmt = emitInsModeFormat(ins, IF_SRD_CNS); - break; - } - - instrDesc* id = emitNewInstrCns(attr, val); - id->idIns(ins); - id->idInsFmt(fmt); - id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); - - UNATIVE_OFFSET sz = emitInsSizeSV(id, insCodeMI(ins), varx, offs, val); - id->idCodeSize(sz); -#ifdef DEBUG - id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; -#endif - dispIns(id); - emitCurIGsize += sz; -} - -/***************************************************************************** - * - * Record that a jump instruction uses the short encoding - * - */ -void emitter::emitSetShortJump(instrDescJmp* id) -{ - if (id->idjKeepLong) - { - return; - } - - id->idjShort = true; -} - -/***************************************************************************** - * - * Add a jmp instruction. - * When dst is NULL, instrCount specifies number of instructions - * to jump: positive is forward, negative is backward. - */ - -void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 */) -{ - assert(false); - -} - -#if !FEATURE_FIXED_OUT_ARGS - -//------------------------------------------------------------------------ -// emitAdjustStackDepthPushPop: Adjust the current and maximum stack depth. 
-// -// Arguments: -// ins - the instruction. Only INS_push and INS_pop adjust the stack depth. -// -// Notes: -// 1. Alters emitCurStackLvl and possibly emitMaxStackDepth. -// 2. emitCntStackDepth must be set (0 in prolog/epilog, one DWORD elsewhere) -// -void emitter::emitAdjustStackDepthPushPop(instruction ins) -{ - if (ins == INS_push) - { - emitCurStackLvl += emitCntStackDepth; - - if (emitMaxStackDepth < emitCurStackLvl) - { - JITDUMP("Upping emitMaxStackDepth from %d to %d\n", emitMaxStackDepth, emitCurStackLvl); - emitMaxStackDepth = emitCurStackLvl; - } - } - else if (ins == INS_pop) - { - emitCurStackLvl -= emitCntStackDepth; - assert((int)emitCurStackLvl >= 0); - } -} - -//------------------------------------------------------------------------ -// emitAdjustStackDepth: Adjust the current and maximum stack depth. -// -// Arguments: -// ins - the instruction. Only INS_add and INS_sub adjust the stack depth. -// It is assumed that the add/sub is on the stack pointer. -// val - the number of bytes to add to or subtract from the stack pointer. -// -// Notes: -// 1. Alters emitCurStackLvl and possibly emitMaxStackDepth. -// 2. emitCntStackDepth must be set (0 in prolog/epilog, one DWORD elsewhere) -// -void emitter::emitAdjustStackDepth(instruction ins, ssize_t val) -{ - // If we're in the prolog or epilog, or otherwise not tracking the stack depth, just return. - if (emitCntStackDepth == 0) - return; - - if (ins == INS_sub) - { - S_UINT32 newStackLvl(emitCurStackLvl); - newStackLvl += S_UINT32(val); - noway_assert(!newStackLvl.IsOverflow()); - - emitCurStackLvl = newStackLvl.Value(); - - if (emitMaxStackDepth < emitCurStackLvl) - { - JITDUMP("Upping emitMaxStackDepth from %d to %d\n", emitMaxStackDepth, emitCurStackLvl); - emitMaxStackDepth = emitCurStackLvl; - } - } - else if (ins == INS_add) - { - S_UINT32 newStackLvl = S_UINT32(emitCurStackLvl) - S_UINT32(val); - noway_assert(!newStackLvl.IsOverflow()); - - emitCurStackLvl = newStackLvl.Value(); - } -} - -#endif // EMIT_TRACK_STACK_DEPTH - -/***************************************************************************** - * - * Add a call instruction (direct or indirect). - * argSize<0 means that the caller will pop the arguments - * - * The other arguments are interpreted depending on callType as shown: - * Unless otherwise specified, ireg,xreg,xmul,disp should have default values. - * - * EC_FUNC_TOKEN : addr is the method address - * EC_FUNC_TOKEN_INDIR : addr is the indirect method address - * EC_FUNC_ADDR : addr is the absolute address of the function - * EC_FUNC_VIRTUAL : "call [ireg+disp]" - * - * If callType is one of these emitCallTypes, addr has to be NULL. - * EC_INDIR_R : "call ireg". - * EC_INDIR_SR : "call lcl" (eg. call [ebp-8]). - * EC_INDIR_C : "call clsVar" (eg. 
call [clsVarAddr]) - * EC_INDIR_ARD : "call [ireg+xreg*xmul+disp]" - * - */ - -// clang-format off -void emitter::emitIns_Call(EmitCallType callType, - CORINFO_METHOD_HANDLE methHnd, - INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) // used to report call sites to the EE - void* addr, - ssize_t argSize, - emitAttr retSize - MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize), - VARSET_VALARG_TP ptrVars, - regMaskTP gcrefRegs, - regMaskTP byrefRegs, - IL_OFFSETX ilOffset, // = BAD_IL_OFFSET - regNumber ireg, // = REG_NA - regNumber xreg, // = REG_NA - unsigned xmul, // = 0 - ssize_t disp, // = 0 - bool isJump) // = false -// clang-format on -{ - /* Sanity check the arguments depending on callType */ - - assert(callType < EC_COUNT); - assert((callType != EC_FUNC_TOKEN && callType != EC_FUNC_TOKEN_INDIR && callType != EC_FUNC_ADDR) || - (ireg == REG_NA && xreg == REG_NA && xmul == 0 && disp == 0)); - assert(callType != EC_FUNC_VIRTUAL || (ireg < REG_COUNT && xreg == REG_NA && xmul == 0)); - assert(callType < EC_INDIR_R || callType == EC_INDIR_ARD || callType == EC_INDIR_C || addr == nullptr); - assert(callType != EC_INDIR_R || (ireg < REG_COUNT && xreg == REG_NA && xmul == 0 && disp == 0)); - assert(callType != EC_INDIR_SR || - (ireg == REG_NA && xreg == REG_NA && xmul == 0 && disp < (int)emitComp->lvaCount)); - assert(callType != EC_INDIR_C || (ireg == REG_NA && xreg == REG_NA && xmul == 0 && disp != 0)); - - // Our stack level should be always greater than the bytes of arguments we push. Just - // a sanity test. - assert((unsigned)abs((signed)argSize) <= codeGen->genStackLevel); - - // Trim out any callee-trashed registers from the live set. - regMaskTP savedSet = emitGetGCRegsSavedOrModified(methHnd); - gcrefRegs &= savedSet; - byrefRegs &= savedSet; - -#ifdef DEBUG - if (EMIT_GC_VERBOSE) - { - printf("\t\t\t\t\t\t\tCall: GCvars=%s ", VarSetOps::ToString(emitComp, ptrVars)); - dumpConvertedVarSet(emitComp, ptrVars); - printf(", gcrefRegs="); - printRegMaskInt(gcrefRegs); - emitDispRegSet(gcrefRegs); - printf(", byrefRegs="); - printRegMaskInt(byrefRegs); - emitDispRegSet(byrefRegs); - printf("\n"); - } -#endif - - /* Managed RetVal: emit sequence point for the call */ - if (emitComp->opts.compDbgInfo && ilOffset != BAD_IL_OFFSET) - { - codeGen->genIPmappingAdd(ilOffset, false); - } - - /* - We need to allocate the appropriate instruction descriptor based - on whether this is a direct/indirect call, and whether we need to - record an updated set of live GC variables. - - The stats for a ton of classes is as follows: - - Direct call w/o GC vars 220,216 - Indir. call w/o GC vars 144,781 - - Direct call with GC vars 9,440 - Indir. call with GC vars 5,768 - */ - - instrDesc* id; - - assert(argSize % REGSIZE_BYTES == 0); - int argCnt = (int)(argSize / (int)REGSIZE_BYTES); // we need a signed-divide - - if (callType >= EC_FUNC_VIRTUAL) - { - /* Indirect call, virtual calls */ - - assert(callType == EC_FUNC_VIRTUAL || callType == EC_INDIR_R || callType == EC_INDIR_SR || - callType == EC_INDIR_C || callType == EC_INDIR_ARD); - - id = emitNewInstrCallInd(argCnt, disp, ptrVars, gcrefRegs, byrefRegs, - retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize)); - } - else - { - // Helper/static/nonvirtual/function calls (direct or through handle), - // and calls to an absolute addr. 
- - assert(callType == EC_FUNC_TOKEN || callType == EC_FUNC_TOKEN_INDIR || callType == EC_FUNC_ADDR); - - id = emitNewInstrCallDir(argCnt, ptrVars, gcrefRegs, byrefRegs, - retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(secondRetSize)); - } - - /* Update the emitter's live GC ref sets */ - - VarSetOps::Assign(emitComp, emitThisGCrefVars, ptrVars); - emitThisGCrefRegs = gcrefRegs; - emitThisByrefRegs = byrefRegs; - - /* Set the instruction - special case jumping a function */ - instruction ins = INS_call; - - if (isJump) - { - assert(callType == EC_FUNC_TOKEN || callType == EC_FUNC_TOKEN_INDIR || callType == EC_INDIR_ARD); - if (callType == EC_FUNC_TOKEN) - { - ins = INS_l_jmp; - } - else - { - ins = INS_i_jmp; - } - } - id->idIns(ins); - - id->idSetIsNoGC(emitNoGChelper(methHnd)); - - UNATIVE_OFFSET sz; - - // Record the address: method, indirection, or funcptr - if (callType >= EC_FUNC_VIRTUAL) - { - // This is an indirect call (either a virtual call or func ptr call) - - switch (callType) - { - case EC_INDIR_C: - // Indirect call using an absolute code address. - // Must be marked as relocatable and is done at the - // branch target location. - goto CALL_ADDR_MODE; - - case EC_INDIR_R: // the address is in a register - - id->idSetIsCallRegPtr(); - - FALLTHROUGH; - - case EC_INDIR_ARD: // the address is an indirection - - goto CALL_ADDR_MODE; - - case EC_INDIR_SR: // the address is in a lcl var - - id->idInsFmt(IF_SRD); - // disp is really a lclVarNum - noway_assert((unsigned)disp == (size_t)disp); - id->idAddr()->iiaLclVar.initLclVarAddr((unsigned)disp, 0); - sz = emitInsSizeSV(id, insCodeMR(INS_call), (unsigned)disp, 0); - - break; - - case EC_FUNC_VIRTUAL: - - CALL_ADDR_MODE: - - // fall-through - - // The function is "ireg" if id->idIsCallRegPtr(), - // else [ireg+xmul*xreg+disp] - - id->idInsFmt(IF_ARD); - - id->idAddr()->iiaAddrMode.amBaseReg = ireg; - id->idAddr()->iiaAddrMode.amIndxReg = xreg; - id->idAddr()->iiaAddrMode.amScale = xmul ? emitEncodeScale(xmul) : emitter::OPSZ1; - - sz = emitInsSizeAM(id, insCodeMR(INS_call)); - - if (ireg == REG_NA && xreg == REG_NA) - { - if (codeGen->genCodeIndirAddrNeedsReloc(disp)) - { - id->idSetIsDspReloc(); - } -#ifdef TARGET_AMD64 - else - { - // An absolute indir address that doesn't need reloc should fit within 32-bits - // to be encoded as offset relative to zero. This addr mode requires an extra - // SIB byte - noway_assert(static_cast(reinterpret_cast(addr)) == (size_t)addr); - sz++; - } -#endif // TARGET_AMD64 - } - - break; - - default: - NO_WAY("unexpected instruction"); - break; - } - } - else if (callType == EC_FUNC_TOKEN_INDIR) - { - /* "call [method_addr]" */ - - assert(addr != nullptr); - - id->idInsFmt(IF_METHPTR); - id->idAddr()->iiaAddr = (BYTE*)addr; - sz = 6; - - // Since this is an indirect call through a pointer and we don't - // currently pass in emitAttr into this function, we query codegen - // whether addr needs a reloc. - if (codeGen->genCodeIndirAddrNeedsReloc((size_t)addr)) - { - id->idSetIsDspReloc(); - } -#ifdef TARGET_AMD64 - else - { - // An absolute indir address that doesn't need reloc should fit within 32-bits - // to be encoded as offset relative to zero. 
This addr mode requires an extra - // SIB byte - noway_assert(static_cast(reinterpret_cast(addr)) == (size_t)addr); - sz++; - } -#endif // TARGET_AMD64 - } - else - { - /* This is a simple direct call: "call helper/method/addr" */ - - assert(callType == EC_FUNC_TOKEN || callType == EC_FUNC_ADDR); - - assert(addr != nullptr); - - id->idInsFmt(IF_METHOD); - sz = 5; - - id->idAddr()->iiaAddr = (BYTE*)addr; - - if (callType == EC_FUNC_ADDR) - { - id->idSetIsCallAddr(); - } - - // Direct call to a method and no addr indirection is needed. - if (codeGen->genCodeAddrNeedsReloc((size_t)addr)) - { - id->idSetIsDspReloc(); - } - } - -#ifdef DEBUG - if (emitComp->verbose && 0) - { - if (id->idIsLargeCall()) - { - if (callType >= EC_FUNC_VIRTUAL) - { - printf("[%02u] Rec call GC vars = %s\n", id->idDebugOnlyInfo()->idNum, - VarSetOps::ToString(emitComp, ((instrDescCGCA*)id)->idcGCvars)); - } - else - { - printf("[%02u] Rec call GC vars = %s\n", id->idDebugOnlyInfo()->idNum, - VarSetOps::ToString(emitComp, ((instrDescCGCA*)id)->idcGCvars)); - } - } - } - - id->idDebugOnlyInfo()->idMemCookie = (size_t)methHnd; // method token - id->idDebugOnlyInfo()->idCallSig = sigInfo; -#endif // DEBUG - -#ifdef LATE_DISASM - if (addr != nullptr) - { - codeGen->getDisAssembler().disSetMethod((size_t)addr, methHnd); - } -#endif // LATE_DISASM - - id->idCodeSize(sz); - - dispIns(id); - emitCurIGsize += sz; - -#if !FEATURE_FIXED_OUT_ARGS - - /* The call will pop the arguments */ - - if (emitCntStackDepth && argSize > 0) - { - noway_assert((ssize_t)emitCurStackLvl >= argSize); - emitCurStackLvl -= (int)argSize; - assert((int)emitCurStackLvl >= 0); - } - -#endif // !FEATURE_FIXED_OUT_ARGS -} - -#ifdef DEBUG -/***************************************************************************** - * - * The following called for each recorded instruction -- use for debugging. - */ -void emitter::emitInsSanityCheck(instrDesc* id) -{ - // make certain you only try to put relocs on things that can have them. - ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; - if ((idOp == ID_OP_SCNS) && id->idIsLargeCns()) - { - idOp = ID_OP_CNS; - } - - if (id->idIsDspReloc()) - { - assert(idOp == ID_OP_NONE || idOp == ID_OP_AMD || idOp == ID_OP_DSP || idOp == ID_OP_DSP_CNS || - idOp == ID_OP_AMD_CNS || idOp == ID_OP_SPEC || idOp == ID_OP_CALL || idOp == ID_OP_JMP || - idOp == ID_OP_LBL); - } - - if (id->idIsCnsReloc()) - { - assert(idOp == ID_OP_CNS || idOp == ID_OP_AMD_CNS || idOp == ID_OP_DSP_CNS || idOp == ID_OP_SPEC || - idOp == ID_OP_CALL || idOp == ID_OP_JMP); - } -} -#endif - -/***************************************************************************** - * - * Return the allocated size (in bytes) of the given instruction descriptor. - */ - -size_t emitter::emitSizeOfInsDsc(instrDesc* id) -{ - if (emitIsScnsInsDsc(id)) - { - return SMALL_IDSC_SIZE; - } - - assert((unsigned)id->idInsFmt() < emitFmtCount); - - ID_OPS idOp = (ID_OPS)emitFmtToOps[id->idInsFmt()]; - - // An INS_call instruction may use a "fat" direct/indirect call descriptor - // except for a local call to a label (i.e. 
call to a finally) - // Only ID_OP_CALL and ID_OP_SPEC check for this, so we enforce that the - // INS_call instruction always uses one of these idOps - - if (id->idIns() == INS_call) - { - assert(idOp == ID_OP_CALL || // is a direct call - idOp == ID_OP_SPEC || // is a indirect call - idOp == ID_OP_JMP); // is a local call to finally clause - } - - switch (idOp) - { - case ID_OP_NONE: -#if FEATURE_LOOP_ALIGN - if (id->idIns() == INS_align) - { - return sizeof(instrDescAlign); - } -#endif - break; - - case ID_OP_LBL: - return sizeof(instrDescLbl); - - case ID_OP_JMP: - return sizeof(instrDescJmp); - - case ID_OP_CALL: - case ID_OP_SPEC: - if (id->idIsLargeCall()) - { - /* Must be a "fat" indirect call descriptor */ - return sizeof(instrDescCGCA); - } - - FALLTHROUGH; - - case ID_OP_SCNS: - case ID_OP_CNS: - case ID_OP_DSP: - case ID_OP_DSP_CNS: - if (id->idIsLargeCns()) - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescCnsDsp); - } - else - { - return sizeof(instrDescCns); - } - } - else - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescDsp); - } - else - { - return sizeof(instrDesc); - } - } - case ID_OP_AMD: - case ID_OP_AMD_CNS: - if (id->idIsLargeCns()) - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescCnsAmd); - } - else - { - return sizeof(instrDescCns); - } - } - else - { - if (id->idIsLargeDsp()) - { - return sizeof(instrDescAmd); - } - else - { - return sizeof(instrDesc); - } - } - - default: - NO_WAY("unexpected instruction descriptor format"); - break; - } - - return sizeof(instrDesc); -} - -/*****************************************************************************/ -#ifdef DEBUG -/***************************************************************************** - * - * Return a string that represents the given register. 
- */ - -const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) -{ - static char rb[2][128]; - static unsigned char rbc = 0; - - const char* rn = emitComp->compRegVarName(reg, varName); - -#ifdef TARGET_AMD64 - char suffix = '\0'; - - switch (EA_SIZE(attr)) - { - case EA_32BYTE: - return emitYMMregName(reg); - - case EA_16BYTE: - return emitXMMregName(reg); - - case EA_8BYTE: - if ((REG_XMM0 <= reg) && (reg <= REG_XMM15)) - { - return emitXMMregName(reg); - } - break; - - case EA_4BYTE: - if ((REG_XMM0 <= reg) && (reg <= REG_XMM15)) - { - return emitXMMregName(reg); - } - - if (reg > REG_R15) - { - break; - } - - if (reg > REG_RDI) - { - suffix = 'd'; - goto APPEND_SUFFIX; - } - rbc = (rbc + 1) % 2; - rb[rbc][0] = 'e'; - rb[rbc][1] = rn[1]; - rb[rbc][2] = rn[2]; - rb[rbc][3] = 0; - rn = rb[rbc]; - break; - - case EA_2BYTE: - if (reg > REG_RDI) - { - suffix = 'w'; - goto APPEND_SUFFIX; - } - rn++; - break; - - case EA_1BYTE: - if (reg > REG_RDI) - { - suffix = 'b'; - APPEND_SUFFIX: - rbc = (rbc + 1) % 2; - rb[rbc][0] = rn[0]; - rb[rbc][1] = rn[1]; - if (rn[2]) - { - assert(rn[3] == 0); - rb[rbc][2] = rn[2]; - rb[rbc][3] = suffix; - rb[rbc][4] = 0; - } - else - { - rb[rbc][2] = suffix; - rb[rbc][3] = 0; - } - } - else - { - rbc = (rbc + 1) % 2; - rb[rbc][0] = rn[1]; - if (reg < 4) - { - rb[rbc][1] = 'l'; - rb[rbc][2] = 0; - } - else - { - rb[rbc][1] = rn[2]; - rb[rbc][2] = 'l'; - rb[rbc][3] = 0; - } - } - - rn = rb[rbc]; - break; - - default: - break; - } -#endif // TARGET_AMD64 - -#ifdef TARGET_X86 - assert(strlen(rn) >= 3); - - switch (EA_SIZE(attr)) - { - case EA_32BYTE: - return emitYMMregName(reg); - - case EA_16BYTE: - return emitXMMregName(reg); - - case EA_8BYTE: - if ((REG_XMM0 <= reg) && (reg <= REG_XMM7)) - { - return emitXMMregName(reg); - } - break; - - case EA_4BYTE: - if ((REG_XMM0 <= reg) && (reg <= REG_XMM7)) - { - return emitXMMregName(reg); - } - break; - - case EA_2BYTE: - rn++; - break; - - case EA_1BYTE: - rbc = (rbc + 1) % 2; - rb[rbc][0] = rn[1]; - rb[rbc][1] = 'l'; - strcpy_s(&rb[rbc][2], sizeof(rb[0]) - 2, rn + 3); - - rn = rb[rbc]; - break; - - default: - break; - } -#endif // TARGET_X86 - -#if 0 - // The following is useful if you want register names to be tagged with * or ^ representing gcref or byref, respectively, - // however it's possibly not interesting most of the time. - if (EA_IS_GCREF(attr) || EA_IS_BYREF(attr)) - { - if (rn != rb[rbc]) - { - rbc = (rbc+1)%2; - strcpy_s(rb[rbc], sizeof(rb[rbc]), rn); - rn = rb[rbc]; - } - - if (EA_IS_GCREF(attr)) - { - strcat_s(rb[rbc], sizeof(rb[rbc]), "*"); - } - else if (EA_IS_BYREF(attr)) - { - strcat_s(rb[rbc], sizeof(rb[rbc]), "^"); - } - } -#endif // 0 - - return rn; -} - -/***************************************************************************** - * - * Return a string that represents the given FP register. - */ - -const char* emitter::emitFPregName(unsigned reg, bool varName) -{ - assert(reg < REG_COUNT); - - return emitComp->compFPregVarName((regNumber)(reg), varName); -} - -/***************************************************************************** - * - * Return a string that represents the given XMM register. 
- */ - -const char* emitter::emitXMMregName(unsigned reg) -{ - static const char* const regNames[] = { -#define REGDEF(name, rnum, mask, sname) "x" sname, -#include "register.h" - }; - - assert(reg < REG_COUNT); - assert(reg < _countof(regNames)); - - return regNames[reg]; -} - -/***************************************************************************** - * - * Return a string that represents the given YMM register. - */ - -const char* emitter::emitYMMregName(unsigned reg) -{ - static const char* const regNames[] = { -#define REGDEF(name, rnum, mask, sname) "y" sname, -#include "register.h" - }; - - assert(reg < REG_COUNT); - assert(reg < _countof(regNames)); - - return regNames[reg]; -} - -/***************************************************************************** - * - * Display a static data member reference. - */ - -void emitter::emitDispClsVar(CORINFO_FIELD_HANDLE fldHnd, ssize_t offs, bool reloc /* = false */) -{ - int doffs; - - /* Filter out the special case of fs:[offs] */ - - // Munge any pointers if we want diff-able disassembly - if (emitComp->opts.disDiffable) - { - ssize_t top12bits = (offs >> 20); - if ((top12bits != 0) && (top12bits != -1)) - { - offs = 0xD1FFAB1E; - } - } - - if (fldHnd == FLD_GLOBAL_FS) - { - printf("FS:[0x%04X]", offs); - return; - } - - if (fldHnd == FLD_GLOBAL_DS) - { - printf("[0x%04X]", offs); - return; - } - - printf("["); - - doffs = Compiler::eeGetJitDataOffs(fldHnd); - - if (reloc) - { - printf("reloc "); - } - - if (doffs >= 0) - { - if (doffs & 1) - { - printf("@CNS%02u", doffs - 1); - } - else - { - printf("@RWD%02u", doffs); - } - - if (offs) - { - printf("%+Id", offs); - } - } - else - { - printf("classVar[%#x]", emitComp->dspPtr(fldHnd)); - - if (offs) - { - printf("%+Id", offs); - } - } - - printf("]"); - - if (emitComp->opts.varNames && offs < 0) - { - printf("'%s", emitComp->eeGetFieldName(fldHnd)); - if (offs) - { - printf("%+Id", offs); - } - printf("'"); - } -} - -/***************************************************************************** - * - * Display a stack frame reference. 
- */ - -void emitter::emitDispFrameRef(int varx, int disp, int offs, bool asmfm) -{ - int addr; - bool bEBP; - - printf("["); - - if (!asmfm || emitComp->lvaDoneFrameLayout == Compiler::NO_FRAME_LAYOUT) - { - if (varx < 0) - { - printf("TEMP_%02u", -varx); - } - else - { - printf("V%02u", +varx); - } - - if (disp < 0) - { - printf("-0x%X", -disp); - } - else if (disp > 0) - { - printf("+0x%X", +disp); - } - } - - if (emitComp->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT) - { - if (!asmfm) - { - printf(" "); - } - - addr = emitComp->lvaFrameAddress(varx, &bEBP) + disp; - - if (bEBP) - { - printf(STR_FPBASE); - - if (addr < 0) - { - printf("-%02XH", -addr); - } - else if (addr > 0) - { - printf("+%02XH", addr); - } - } - else - { - /* Adjust the offset by amount currently pushed on the stack */ - - printf(STR_SPBASE); - - if (addr < 0) - { - printf("-%02XH", -addr); - } - else if (addr > 0) - { - printf("+%02XH", addr); - } - -#if !FEATURE_FIXED_OUT_ARGS - - if (emitCurStackLvl) - printf("+%02XH", emitCurStackLvl); - -#endif // !FEATURE_FIXED_OUT_ARGS - } - } - - printf("]"); - - if (varx >= 0 && emitComp->opts.varNames) - { - LclVarDsc* varDsc; - const char* varName; - - assert((unsigned)varx < emitComp->lvaCount); - varDsc = emitComp->lvaTable + varx; - varName = emitComp->compLocalVarName(varx, offs); - - if (varName) - { - printf("'%s", varName); - - if (disp < 0) - { - printf("-%d", -disp); - } - else if (disp > 0) - { - printf("+%d", +disp); - } - - printf("'"); - } - } -} - -/***************************************************************************** - * - * Display an reloc value - * If we are formatting for an assembly listing don't print the hex value - * since it will prevent us from doing assembly diffs - */ -void emitter::emitDispReloc(ssize_t value) -{ - if (emitComp->opts.disAsm) - { - printf("(reloc)"); - } - else - { - printf("(reloc 0x%Ix)", emitComp->dspPtr(value)); - } -} - -/***************************************************************************** - * - * Display an address mode. - */ - -void emitter::emitDispAddrMode(instrDesc* id, bool noDetail) -{ - assert(false); -} - -/***************************************************************************** - * - * If the given instruction is a shift, display the 2nd operand. 
- */ - -void emitter::emitDispShift(instruction ins, int cnt) -{ - switch (ins) - { - case INS_rcl_1: - case INS_rcr_1: - case INS_rol_1: - case INS_ror_1: - case INS_shl_1: - case INS_shr_1: - case INS_sar_1: - printf(", 1"); - break; - - case INS_rcl: - case INS_rcr: - case INS_rol: - case INS_ror: - case INS_shl: - case INS_shr: - case INS_sar: - printf(", cl"); - break; - - case INS_rcl_N: - case INS_rcr_N: - case INS_rol_N: - case INS_ror_N: - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - printf(", %d", cnt); - break; - - default: - break; - } -} - -/***************************************************************************** - * - * Display (optionally) the bytes for the instruction encoding in hex - */ - -void emitter::emitDispInsHex(instrDesc* id, BYTE* code, size_t sz) -{ - // We do not display the instruction hex if we want diff-able disassembly - if (!emitComp->opts.disDiffable) - { -#ifdef TARGET_AMD64 - // how many bytes per instruction we format for - const size_t digits = 10; -#else // TARGET_X86 - const size_t digits = 6; -#endif - printf(" "); - for (unsigned i = 0; i < sz; i++) - { - printf("%02X", (*((BYTE*)(code + i)))); - } - - if (sz < digits) - { - printf("%.*s", 2 * (digits - sz), " "); - } - } -} - -/***************************************************************************** - * - * Display the given instruction. - */ - -void emitter::emitDispIns( - instrDesc* id, bool isNew, bool doffs, bool asmfm, unsigned offset, BYTE* code, size_t sz, insGroup* ig) -{ - assert(false); - -} - -/*****************************************************************************/ -#endif - -/***************************************************************************** - * - * Output nBytes bytes of NOP instructions - */ - -//static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes) -//{ -// assert(nBytes <= 15); -// -//#ifndef TARGET_AMD64 -// // TODO-X86-CQ: when VIA C3 CPU's are out of circulation, switch to the -// // more efficient real NOP: 0x0F 0x1F +modR/M -// // Also can't use AMD recommended, multiple size prefixes (i.e. 
0x66 0x66 0x90 for 3 byte NOP) -// // because debugger and msdis don't like it, so maybe VIA doesn't either -// // So instead just stick to repeating single byte nops -// -// switch (nBytes) -// { -// case 15: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 14: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 13: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 12: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 11: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 10: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 9: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 8: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 7: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 6: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 5: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 4: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 3: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 2: -// *dst++ = 0x90; -// FALLTHROUGH; -// case 1: -// *dst++ = 0x90; -// break; -// case 0: -// break; -// } -//#else // TARGET_AMD64 -// switch (nBytes) -// { -// case 2: -// *dst++ = 0x66; -// FALLTHROUGH; -// case 1: -// *dst++ = 0x90; -// break; -// case 0: -// break; -// case 3: -// *dst++ = 0x0F; -// *dst++ = 0x1F; -// *dst++ = 0x00; -// break; -// case 4: -// *dst++ = 0x0F; -// *dst++ = 0x1F; -// *dst++ = 0x40; -// *dst++ = 0x00; -// break; -// case 6: -// *dst++ = 0x66; -// FALLTHROUGH; -// case 5: -// *dst++ = 0x0F; -// *dst++ = 0x1F; -// *dst++ = 0x44; -// *dst++ = 0x00; -// *dst++ = 0x00; -// break; -// case 7: -// *dst++ = 0x0F; -// *dst++ = 0x1F; -// *dst++ = 0x80; -// *dst++ = 0x00; -// *dst++ = 0x00; -// *dst++ = 0x00; -// *dst++ = 0x00; -// break; -// case 15: -// // More than 3 prefixes is slower than just 2 NOPs -// dst = emitOutputNOP(emitOutputNOP(dst, 7), 8); -// break; -// case 14: -// // More than 3 prefixes is slower than just 2 NOPs -// dst = emitOutputNOP(emitOutputNOP(dst, 7), 7); -// break; -// case 13: -// // More than 3 prefixes is slower than just 2 NOPs -// dst = emitOutputNOP(emitOutputNOP(dst, 5), 8); -// break; -// case 12: -// // More than 3 prefixes is slower than just 2 NOPs -// dst = emitOutputNOP(emitOutputNOP(dst, 4), 8); -// break; -// case 11: -// *dst++ = 0x66; -// FALLTHROUGH; -// case 10: -// *dst++ = 0x66; -// FALLTHROUGH; -// case 9: -// *dst++ = 0x66; -// FALLTHROUGH; -// case 8: -// *dst++ = 0x0F; -// *dst++ = 0x1F; -// *dst++ = 0x84; -// *dst++ = 0x00; -// *dst++ = 0x00; -// *dst++ = 0x00; -// *dst++ = 0x00; -// *dst++ = 0x00; -// break; -// } -//#endif // TARGET_AMD64 -// -// return dst; -//} - -//-------------------------------------------------------------------- -// emitOutputAlign: Outputs NOP to align the loop -// -// Arguments: -// ig - Current instruction group -// id - align instruction that holds amount of padding (NOPs) to add -// dst - Destination buffer -// -// Return Value: -// None. -// -// Notes: -// Amount of padding needed to align the loop is already calculated. This -// method extracts that information and inserts suitable NOP instructions. -// -BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Output an instruction involving an address mode. - */ - -BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Output an instruction involving a stack frame value. 
- */ - -BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Output an instruction with a static data member (class variable). - */ - -BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Output an instruction with one register operand. - */ - -BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) -{ - code_t code; - - instruction ins = id->idIns(); - regNumber reg = id->idReg1(); - emitAttr size = id->idOpSize(); - - // We would to update GC info correctly - assert(!IsSSEInstruction(ins)); - assert(!IsAVXInstruction(ins)); - - // Get the 'base' opcode - switch (ins) - { - case INS_inc: - case INS_dec: - -#ifdef TARGET_AMD64 - if (true) -#else - if (size == EA_1BYTE) -#endif - { - assert(INS_inc_l == INS_inc + 1); - assert(INS_dec_l == INS_dec + 1); - - // Can't use the compact form, use the long form - ins = (instruction)(ins + 1); - if (size == EA_2BYTE) - { - // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); - } - - code = insCodeRR(ins); - if (size != EA_1BYTE) - { - // Set the 'w' bit to get the large version - code |= 0x1; - } - - if (TakesRexWPrefix(ins, size)) - { - code = AddRexWPrefix(ins, code); - } - - // Register... - unsigned regcode = insEncodeReg012(ins, reg, size, &code); - - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); - - dst += emitOutputWord(dst, code | (regcode << 8)); - } - else - { - if (size == EA_2BYTE) - { - // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); - } - dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(ins, reg, size, nullptr)); - } - break; - - case INS_pop: - case INS_pop_hide: - case INS_push: - case INS_push_hide: - - assert(size == EA_PTRSIZE); - code = insEncodeOpreg(ins, reg, size); - - assert(!TakesVexPrefix(ins)); - assert(!TakesRexWPrefix(ins, size)); - - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); - - dst += emitOutputByte(dst, code); - break; - - case INS_bswap: - { - assert(size >= EA_4BYTE && size <= EA_PTRSIZE); // 16-bit BSWAP is undefined - - // The Intel instruction set reference for BSWAP states that extended registers - // should be enabled via REX.R, but per Vol. 2A, Sec. 2.2.1.2 (see also Figure 2-7), - // REX.B should instead be used if the register is encoded in the opcode byte itself. - // Therefore the default logic of insEncodeReg012 is correct for this case. - - code = insCodeRR(ins); - - if (TakesRexWPrefix(ins, size)) - { - code = AddRexWPrefix(ins, code); - } - - // Register... 
- unsigned regcode = insEncodeReg012(ins, reg, size, &code); - - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); - - dst += emitOutputWord(dst, code | (regcode << 8)); - break; - } - - case INS_seto: - case INS_setno: - case INS_setb: - case INS_setae: - case INS_sete: - case INS_setne: - case INS_setbe: - case INS_seta: - case INS_sets: - case INS_setns: - case INS_setp: - case INS_setnp: - case INS_setl: - case INS_setge: - case INS_setle: - case INS_setg: - - assert(id->idGCref() == GCT_NONE); - assert(size == EA_1BYTE); - - code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins)); - - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); - - // We expect this to always be a 'big' opcode - assert(code & 0x00FF0000); - - dst += emitOutputByte(dst, code >> 16); - dst += emitOutputWord(dst, code & 0x0000FFFF); - - break; - - case INS_mulEAX: - case INS_imulEAX: - - // Kill off any GC refs in EAX or EDX - emitGCregDeadUpd(REG_EAX, dst); - emitGCregDeadUpd(REG_EDX, dst); - - FALLTHROUGH; - - default: - - assert(id->idGCref() == GCT_NONE); - - code = insEncodeMRreg(ins, reg, size, insCodeMR(ins)); - - if (size != EA_1BYTE) - { - // Set the 'w' bit to get the large version - code |= 0x1; - - if (size == EA_2BYTE) - { - // Output a size prefix for a 16-bit operand - dst += emitOutputByte(dst, 0x66); - } - } - - code = AddVexPrefixIfNeeded(ins, code, size); - - if (TakesRexWPrefix(ins, size)) - { - code = AddRexWPrefix(ins, code); - } - - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); - - dst += emitOutputWord(dst, code); - break; - } - - // Are we writing the register? if so then update the GC information - switch (id->idInsFmt()) - { - case IF_RRD: - break; - case IF_RWR: - if (id->idGCref()) - { - emitGCregLiveUpd(id->idGCref(), id->idReg1(), dst); - } - else - { - emitGCregDeadUpd(id->idReg1(), dst); - } - break; - case IF_RRW: - { -#ifdef DEBUG - regMaskTP regMask = genRegMask(reg); -#endif - if (id->idGCref()) - { - // The reg must currently be holding either a gcref or a byref - // and the instruction must be inc or dec - assert(((emitThisGCrefRegs | emitThisByrefRegs) & regMask) && - (ins == INS_inc || ins == INS_dec || ins == INS_inc_l || ins == INS_dec_l)); - assert(id->idGCref() == GCT_BYREF); - // Mark it as holding a GCT_BYREF - emitGCregLiveUpd(GCT_BYREF, id->idReg1(), dst); - } - else - { - // Can't use RRW to trash a GC ref. It's OK for unverifiable code - // to trash Byrefs. - assert((emitThisGCrefRegs & regMask) == 0); - } - } - break; - default: -#ifdef DEBUG - emitDispIns(id, false, false, false); -#endif - assert(!"unexpected instruction format"); - break; - } - - return dst; -} - -/***************************************************************************** - * - * Output an instruction with two register operands. - */ - -BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) -{ - assert(false); - return 0; -} - -BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Output an instruction with a register and constant operands. - */ - -BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Output an instruction with a constant operand. 
- */ - -BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) -{ - code_t code; - instruction ins = id->idIns(); - emitAttr size = id->idOpSize(); - ssize_t val = emitGetInsSC(id); - bool valInByte = ((signed char)val == (target_ssize_t)val); - - // We would to update GC info correctly - assert(!IsSSEInstruction(ins)); - assert(!IsAVXInstruction(ins)); - -#ifdef TARGET_AMD64 - // all these opcodes take a sign-extended 4-byte immediate, max - noway_assert(size < EA_8BYTE || ((int)val == val && !id->idIsCnsReloc())); -#endif - - if (id->idIsCnsReloc()) - { - valInByte = false; // relocs can't be placed in a byte - - // Of these instructions only the push instruction can have reloc - assert(ins == INS_push || ins == INS_push_hide); - } - - switch (ins) - { - case INS_jge: - assert((val >= -128) && (val <= 127)); - dst += emitOutputByte(dst, insCode(ins)); - dst += emitOutputByte(dst, val); - break; - - case INS_loop: - assert((val >= -128) && (val <= 127)); - dst += emitOutputByte(dst, insCodeMI(ins)); - dst += emitOutputByte(dst, val); - break; - - case INS_ret: - assert(val); - dst += emitOutputByte(dst, insCodeMI(ins)); - dst += emitOutputWord(dst, val); - break; - - case INS_push_hide: - case INS_push: - code = insCodeMI(ins); - - // Does the operand fit in a byte? - if (valInByte) - { - dst += emitOutputByte(dst, code | 2); - dst += emitOutputByte(dst, val); - } - else - { - if (TakesRexWPrefix(ins, size)) - { - code = AddRexWPrefix(ins, code); - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); - } - - dst += emitOutputByte(dst, code); - dst += emitOutputLong(dst, val); - if (id->idIsCnsReloc()) - { - emitRecordRelocation((void*)(dst - sizeof(INT32)), (void*)(size_t)val, IMAGE_REL_BASED_HIGHLOW); - } - } - - // Did we push a GC ref value? - if (id->idGCref()) - { -#ifdef DEBUG - printf("UNDONE: record GCref push [cns]\n"); -#endif - } - - break; - - default: - assert(!"unexpected instruction"); - } - - return dst; -} - -/***************************************************************************** - * - * Output a local jump instruction. - * This function also handles non-jumps that have jump-like characteristics, like RIP-relative LEA of a label that - * needs to get bound to an actual address and processed by branch shortening. - */ - -BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) -{ - assert(false); - return 0; -} - -/***************************************************************************** - * - * Append the machine code corresponding to the given instruction descriptor - * to the code block at '*dp'; the base of the code block is 'bp', and 'ig' - * is the instruction group that contains the instruction. Updates '*dp' to - * point past the generated code, and returns the size of the instruction - * descriptor in bytes. 
- */ - -#ifdef _PREFAST_ -#pragma warning(push) -#pragma warning(disable : 21000) // Suppress PREFast warning about overly large function -#endif -size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) -{ - assert(false); - return 0; -} -#ifdef _PREFAST_ -#pragma warning(pop) -#endif - -//emitter::insFormat emitter::getMemoryOperation(instrDesc* id) -//{ -// assert(false); -//} - -#if defined(DEBUG) || defined(LATE_DISASM) - -//---------------------------------------------------------------------------------------- -// getInsExecutionCharacteristics: -// Returns the current instruction execution characteristics -// -// Arguments: -// id - The current instruction descriptor to be evaluated -// -// Return Value: -// A struct containing the current instruction execution characteristics -// -// Notes: -// The instruction latencies and throughput values returned by this function -// are for the Intel Skylake-X processor and are from either: -// 1. Agner.org - https://www.agner.org/optimize/instruction_tables.pdf -// 2. uops.info - https://uops.info/table.html -// -emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(instrDesc* id) -{ - assert(false); - insExecutionCharacteristics result; - result.insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; - return result; -} - -#endif // defined(DEBUG) || defined(LATE_DISASM) - -/*****************************************************************************/ -/*****************************************************************************/ - -#endif // defined(TARGET_XARCH) diff --git a/src/coreclr/jit/emitwasm.h b/src/coreclr/jit/emitwasm.h deleted file mode 100644 index 812278701f2f..000000000000 --- a/src/coreclr/jit/emitwasm.h +++ /dev/null @@ -1,573 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) - -/************************************************************************/ -/* Public inline informational methods */ -/************************************************************************/ - -public: -inline static bool isGeneralRegister(regNumber reg) -{ - return (reg <= REG_INT_LAST); -} - -inline static bool isFloatReg(regNumber reg) -{ - return (reg >= REG_FP_FIRST && reg <= REG_FP_LAST); -} - -inline static bool isDoubleReg(regNumber reg) -{ - return isFloatReg(reg); -} - -/************************************************************************/ -/* Routines that compute the size of / encode instructions */ -/************************************************************************/ - -// code_t is a type used to accumulate bits of opcode + prefixes. On amd64, it must be 64 bits -// to support the REX prefixes. On both x86 and amd64, it must be 64 bits to support AVX, with -// its 3-byte VEX prefix. 
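A minimal standalone sketch of the constraint the code_t comment above describes: the emitter folds prefix bytes into the same integer as the opcode, and a 3-byte VEX prefix lands above bit 32, so a 32-bit accumulator would silently drop it. The constants reuse the VEX_PREFIX_* values declared a little further down in this header; everything else is illustrative, not the emitter's real encoding logic.

#include <cassert>

typedef unsigned long long code_t;               // mirrors the 64-bit code_t typedef below

// Same values as VEX_PREFIX_MASK_3BYTE / VEX_PREFIX_CODE_3BYTE in this header.
const code_t VEX_PREFIX_MASK_3BYTE = 0xFF000000000000ULL;
const code_t VEX_PREFIX_CODE_3BYTE = 0xC4000000000000ULL;

int main()
{
    code_t code = 0x0F10;                        // a two-byte opcode, kept in the low bits
    code |= VEX_PREFIX_CODE_3BYTE;               // fold the 0xC4 VEX marker into the high bits
    assert((code & VEX_PREFIX_MASK_3BYTE) == VEX_PREFIX_CODE_3BYTE);
    assert((code >> 32) != 0);                   // this information would not fit in 32 bits
    return 0;
}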
-typedef unsigned __int64 code_t; - -struct CnsVal -{ - ssize_t cnsVal; - bool cnsReloc; -}; - -UNATIVE_OFFSET emitInsSize(code_t code); -UNATIVE_OFFSET emitInsSizeSV(code_t code, int var, int dsp); -UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp); -UNATIVE_OFFSET emitInsSizeSV(instrDesc* id, code_t code, int var, int dsp, int val); -UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code); -UNATIVE_OFFSET emitInsSizeRR(instrDesc* id, code_t code, int val); -UNATIVE_OFFSET emitInsSizeRR(instruction ins, regNumber reg1, regNumber reg2, emitAttr attr); -UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code); -UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val); -UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code); -UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val); - -BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst); -BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); -BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); -BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); - -BYTE* emitOutputR(BYTE* dst, instrDesc* id); -BYTE* emitOutputRI(BYTE* dst, instrDesc* id); -BYTE* emitOutputRR(BYTE* dst, instrDesc* id); -BYTE* emitOutputIV(BYTE* dst, instrDesc* id); - -BYTE* emitOutputRRR(BYTE* dst, instrDesc* id); - -BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id); - -unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); -unsigned emitGetRexPrefixSize(instruction ins); -unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr); -unsigned emitGetPrefixSize(code_t code); -unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code); - -unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); -unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); -code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code); -unsigned insEncodeRegSIB(instruction ins, regNumber reg, code_t* code); - -code_t insEncodeMRreg(instruction ins, code_t code); -code_t insEncodeRMreg(instruction ins, code_t code); -code_t insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code); -code_t insEncodeRRIb(instruction ins, regNumber reg, emitAttr size); -code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); - -unsigned insSSval(unsigned scale); - -bool IsAVXInstruction(instruction ins); -code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); - -code_t AddRexWPrefix(instruction ins, code_t code); -code_t AddRexRPrefix(instruction ins, code_t code); -code_t AddRexXPrefix(instruction ins, code_t code); -code_t AddRexBPrefix(instruction ins, code_t code); -code_t AddRexPrefix(instruction ins, code_t code); - -bool EncodedBySSE38orSSE3A(instruction ins); -bool Is4ByteSSEInstruction(instruction ins); - -bool AreUpper32BitsZero(regNumber reg); - -bool AreFlagsSetToZeroCmp(regNumber reg, emitAttr opSize, bool needsOCFlags); - -bool hasRexPrefix(code_t code) -{ -#ifdef TARGET_AMD64 - const code_t REX_PREFIX_MASK = 0xFF00000000LL; - return (code & REX_PREFIX_MASK) != 0; -#else // !TARGET_AMD64 - return false; -#endif // !TARGET_AMD64 -} - -// 3-byte VEX prefix starts with byte 0xC4 -#define VEX_PREFIX_MASK_3BYTE 0xFF000000000000ULL -#define VEX_PREFIX_CODE_3BYTE 0xC4000000000000ULL - -bool TakesVexPrefix(instruction ins); - -// Returns true if the instruction encoding 
already contains VEX prefix -bool hasVexPrefix(code_t code) -{ - return (code & VEX_PREFIX_MASK_3BYTE) == VEX_PREFIX_CODE_3BYTE; -} -code_t AddVexPrefix(instruction ins, code_t code, emitAttr attr); -code_t AddVexPrefixIfNeeded(instruction ins, code_t code, emitAttr size) -{ - if (TakesVexPrefix(ins)) - { - code = AddVexPrefix(ins, code, size); - } - return code; -} -code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) -{ - if (TakesVexPrefix(ins) && !hasVexPrefix(code)) - { - code = AddVexPrefix(ins, code, size); - } - return code; -} - -bool useVEXEncodings; -bool UseVEXEncoding() -{ - return useVEXEncodings; -} -void SetUseVEXEncoding(bool value) -{ - useVEXEncodings = value; -} - -bool containsAVXInstruction = false; -bool ContainsAVX() -{ - return containsAVXInstruction; -} -void SetContainsAVX(bool value) -{ - containsAVXInstruction = value; -} - -bool contains256bitAVXInstruction = false; -bool Contains256bitAVX() -{ - return contains256bitAVXInstruction; -} -void SetContains256bitAVX(bool value) -{ - contains256bitAVXInstruction = value; -} - -bool IsDstDstSrcAVXInstruction(instruction ins); -bool IsDstSrcSrcAVXInstruction(instruction ins); -bool IsThreeOperandAVXInstruction(instruction ins) -{ - return (IsDstDstSrcAVXInstruction(ins) || IsDstSrcSrcAVXInstruction(ins)); -} -bool isAvxBlendv(instruction ins) -{ - return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; -} -bool isSse41Blendv(instruction ins) -{ - return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb; -} -bool isPrefetch(instruction ins) -{ - return (ins == INS_prefetcht0) || (ins == INS_prefetcht1) || (ins == INS_prefetcht2) || (ins == INS_prefetchnta); -} - -/************************************************************************/ -/* Debug-only routines to display instructions */ -/************************************************************************/ - -#ifdef DEBUG - -const char* emitFPregName(unsigned reg, bool varName = true); - -void emitDispReloc(ssize_t value); -void emitDispAddrMode(instrDesc* id, bool noDetail = false); -void emitDispShift(instruction ins, int cnt = 0); - -void emitDispIns(instrDesc* id, - bool isNew, - bool doffs, - bool asmfm, - unsigned offs = 0, - BYTE* code = nullptr, - size_t sz = 0, - insGroup* ig = nullptr); - -const char* emitXMMregName(unsigned reg); -const char* emitYMMregName(unsigned reg); - -#endif - -/************************************************************************/ -/* Private members that deal with target-dependent instr. 
descriptors */ -/************************************************************************/ - -private: -void emitSetAmdDisp(instrDescAmd* id, ssize_t dsp); -instrDesc* emitNewInstrAmd(emitAttr attr, ssize_t dsp); -instrDesc* emitNewInstrAmdCns(emitAttr attr, ssize_t dsp, int cns); - -instrDesc* emitNewInstrCallDir(int argCnt, - VARSET_VALARG_TP GCvars, - regMaskTP gcrefRegs, - regMaskTP byrefRegs, - emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize)); - -instrDesc* emitNewInstrCallInd(int argCnt, - ssize_t disp, - VARSET_VALARG_TP GCvars, - regMaskTP gcrefRegs, - regMaskTP byrefRegs, - emitAttr retSize MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize)); - -void emitGetInsCns(instrDesc* id, CnsVal* cv); -ssize_t emitGetInsAmdCns(instrDesc* id, CnsVal* cv); -void emitGetInsDcmCns(instrDesc* id, CnsVal* cv); -ssize_t emitGetInsAmdAny(instrDesc* id); - -/************************************************************************/ -/* Private helpers for instruction output */ -/************************************************************************/ - -private: -insFormat emitInsModeFormat(instruction ins, insFormat base, insFormat FPld, insFormat FPst); - -bool emitVerifyEncodable(instruction ins, emitAttr size, regNumber reg1, regNumber reg2 = REG_NA); - -bool emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id); - -#if FEATURE_FIXED_OUT_ARGS -void emitAdjustStackDepthPushPop(instruction ins) -{ -} -void emitAdjustStackDepth(instruction ins, ssize_t val) -{ -} -#else // !FEATURE_FIXED_OUT_ARGS -void emitAdjustStackDepthPushPop(instruction ins); -void emitAdjustStackDepth(instruction ins, ssize_t val); -#endif // !FEATURE_FIXED_OUT_ARGS - -/***************************************************************************** -* -* Convert between an index scale in bytes to a smaller encoding used for -* storage in instruction descriptors. 
-*/ - -inline emitter::opSize emitEncodeScale(size_t scale) -{ - assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); - - return emitSizeEncode[scale - 1]; -} - -inline emitAttr emitDecodeScale(unsigned ensz) -{ - assert(ensz < 4); - - return emitter::emitSizeDecode[ensz]; -} - -/************************************************************************/ -/* The public entry points to output instructions */ -/************************************************************************/ - -public: -void emitLoopAlign(unsigned short paddingBytes = 15); - -void emitLongLoopAlign(unsigned short alignmentBoundary); - -void emitIns(instruction ins); - -void emitIns(instruction ins, emitAttr attr); - -void emitInsRMW(instruction inst, emitAttr attr, GenTreeStoreInd* storeInd, GenTree* src); - -void emitInsRMW(instruction inst, emitAttr attr, GenTreeStoreInd* storeInd); - -void emitIns_Nop(unsigned size); - -void emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val); - -void emitIns_R(instruction ins, emitAttr attr, regNumber reg); - -void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs); - -void emitIns_R_I(instruction ins, emitAttr attr, regNumber reg, ssize_t val); - -void emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2); - -void emitIns_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int ival); - -void emitIns_AR(instruction ins, emitAttr attr, regNumber base, int offs); - -void emitIns_AR_R_R(instruction ins, emitAttr attr, regNumber op2Reg, regNumber op3Reg, regNumber base, int offs); - -void emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir); - -void emitIns_R_A_I(instruction ins, emitAttr attr, regNumber reg1, GenTreeIndir* indir, int ival); - -void emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, regNumber base, int offs, int ival); - -void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); - -void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival); - -void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir); - -void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); - -void emitIns_R_AR_R(instruction ins, - emitAttr attr, - regNumber reg1, - regNumber reg2, - regNumber base, - regNumber index, - int scale, - int offs); - -void emitIns_R_R_C( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs); - -void emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs); - -void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); - -void emitIns_R_R_A_I( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt); -void emitIns_R_R_AR_I( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival); -void emitIns_S_R_I(instruction ins, emitAttr attr, int varNum, int offs, regNumber reg, int ival); - -void emitIns_A_R_I(instruction ins, emitAttr attr, GenTreeIndir* indir, regNumber reg, int imm); - -void emitIns_R_R_C_I( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); - -void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, int 
ival); - -void emitIns_R_R_S_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival); - -void emitIns_R_R_A_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, GenTreeIndir* indir); - -void emitIns_R_R_AR_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, regNumber base, int offs); - -void emitIns_R_R_C_R(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - regNumber op3Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs); - -void emitIns_R_R_S_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op3Reg, int varx, int offs); - -void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4); - -void emitIns_S(instruction ins, emitAttr attr, int varx, int offs); - -void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs); - -void emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs); - -void emitIns_S_I(instruction ins, emitAttr attr, int varx, int offs, int val); - -void emitIns_R_C(instruction ins, emitAttr attr, regNumber reg, CORINFO_FIELD_HANDLE fldHnd, int offs); - -void emitIns_C_R(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fldHnd, regNumber reg, int offs); - -void emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs, int val); - -void emitIns_IJ(emitAttr attr, regNumber reg, unsigned base); - -void emitIns_J_S(instruction ins, emitAttr attr, BasicBlock* dst, int varx, int offs); - -void emitIns_R_L(instruction ins, emitAttr attr, BasicBlock* dst, regNumber reg); - -void emitIns_R_D(instruction ins, emitAttr attr, unsigned offs, regNumber reg); - -void emitIns_I_AR(instruction ins, emitAttr attr, int val, regNumber reg, int offs); - -void emitIns_I_AI(instruction ins, emitAttr attr, int val, ssize_t disp); - -void emitIns_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber base, int disp); - -void emitIns_R_AI(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp); - -void emitIns_AR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, cnsval_ssize_t disp); - -void emitIns_AI_R(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp); - -void emitIns_I_ARR(instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, int disp); - -void emitIns_R_ARR(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp); - -void emitIns_ARR_R(instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, int disp); - -void emitIns_I_ARX(instruction ins, emitAttr attr, int val, regNumber reg, regNumber rg2, unsigned mul, int disp); - -void emitIns_R_ARX( - instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp); - -void emitIns_ARX_R(instruction ins, - emitAttr attr, - regNumber reg, - regNumber base, - regNumber index, - unsigned scale, - cnsval_ssize_t disp); - -void emitIns_I_AX(instruction ins, emitAttr attr, int val, regNumber reg, unsigned mul, int disp); - -void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); - -void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); - -void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); - -void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, 
regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset); -void emitIns_SIMD_R_R_C( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs); -void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); -void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs); - -#ifdef FEATURE_HW_INTRINSICS -void emitIns_SIMD_R_R_A_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); -void emitIns_SIMD_R_R_AR_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int ival); -void emitIns_SIMD_R_R_C_I(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs, - int ival); -void emitIns_SIMD_R_R_R_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int ival); -void emitIns_SIMD_R_R_S_I( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, int ival); - -void emitIns_SIMD_R_R_R_A( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber base); -void emitIns_SIMD_R_R_R_C(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - regNumber op2Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs); -void emitIns_SIMD_R_R_R_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg); -void emitIns_SIMD_R_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs); - -void emitIns_SIMD_R_R_A_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_AR_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber base); -void emitIns_SIMD_R_R_C_R(instruction ins, - emitAttr attr, - regNumber targetReg, - regNumber op1Reg, - regNumber op2Reg, - CORINFO_FIELD_HANDLE fldHnd, - int offs); -void emitIns_SIMD_R_R_S_R( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs); -#endif // FEATURE_HW_INTRINSICS - -enum EmitCallType -{ - EC_FUNC_TOKEN, // Direct call to a helper/static/nonvirtual/global method - EC_FUNC_TOKEN_INDIR, // Indirect call to a helper/static/nonvirtual/global method - EC_FUNC_ADDR, // Direct call to an absolute address - - EC_FUNC_VIRTUAL, // Call to a virtual method (using the vtable) - EC_INDIR_R, // Indirect call via register - EC_INDIR_SR, // Indirect call via stack-reference (local var) - EC_INDIR_C, // Indirect call via static class var - EC_INDIR_ARD, // Indirect call via an addressing mode - - EC_COUNT -}; - -// clang-format off -void emitIns_Call(EmitCallType callType, - CORINFO_METHOD_HANDLE methHnd, - INDEBUG_LDISASM_COMMA(CORINFO_SIG_INFO* sigInfo) // used to report call sites to the EE - void* addr, - ssize_t argSize, - emitAttr retSize - MULTIREG_HAS_SECOND_GC_RET_ONLY_ARG(emitAttr secondRetSize), - VARSET_VALARG_TP ptrVars, - regMaskTP gcrefRegs, - regMaskTP byrefRegs, - IL_OFFSETX 
ilOffset = BAD_IL_OFFSET, - regNumber ireg = REG_NA, - regNumber xreg = REG_NA, - unsigned xmul = 0, - ssize_t disp = 0, - bool isJump = false); -// clang-format on - -#ifdef TARGET_AMD64 -// Is the last instruction emitted a call instruction? -bool emitIsLastInsCall(); - -// Insert a NOP at the end of the the current instruction group if the last emitted instruction was a 'call', -// because the next instruction group will be an epilog. -void emitOutputPreEpilogNOP(); -#endif // TARGET_AMD64 - -/***************************************************************************** - * - * Given a jump, return true if it's a conditional jump. - */ - -inline bool emitIsCondJump(instrDesc* jmp) -{ - instruction ins = jmp->idIns(); - - assert(jmp->idInsFmt() == IF_LABEL); - - return (ins != INS_call && ins != INS_jmp); -} - -/***************************************************************************** - * - * Given a jump, return true if it's an unconditional jump. - */ - -inline bool emitIsUncondJump(instrDesc* jmp) -{ - instruction ins = jmp->idIns(); - - assert(jmp->idInsFmt() == IF_LABEL); - - return (ins == INS_jmp); -} - -#endif // TARGET_XARCH diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index aaa07b087400..ec9c091b5d45 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -15,7 +15,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma hdrstop #endif -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) /*****************************************************************************/ /*****************************************************************************/ diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 8941c5b838e3..2097d13ebcb9 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -2478,7 +2478,9 @@ bool Compiler::fgOptimizeEmptyBlock(BasicBlock* block) { LIR::AsRange(block).InsertAtEnd(nop); LIR::ReadOnlyRange range(nop, nop); +#ifndef TARGET_WASM m_pLowering->LowerRange(block, range); +#endif // TARGET_WASM } else { @@ -2821,7 +2823,9 @@ bool Compiler::fgOptimizeSwitchBranches(BasicBlock* block) { blockRange->InsertAfter(switchVal, zeroConstNode, condNode); LIR::ReadOnlyRange range(zeroConstNode, switchTree); +#ifndef TARGET_WASM m_pLowering->LowerRange(block, range); +#endif // TARGET_WASM } else if (fgStmtListThreaded) { diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 594a59af14ef..c3283b8a4040 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -1518,6 +1518,7 @@ inline void Compiler::fgMarkLoopHead(BasicBlock* block) } } +#ifndef TARGET_WASM /* * We have to make this method fully interruptible since we can not * ensure that this loop will execute a call every time it loops. 
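The hunks above show the pattern this patch repeats across the JIT: calls into the native backend (lowering, LSRA, the emitter, codegen state) are compiled out under TARGET_WASM, sometimes with a placeholder or assert on the #else side. A minimal standalone sketch of that conditional-compilation shape, assuming TARGET_WASM is supplied by the build; LowerRange here is only a stand-in for m_pLowering->LowerRange, not the real phase.

#include <cstdio>

// Stand-in for a backend call that only exists on native targets.
static void LowerRange() { std::puts("lowering the new LIR range"); }

static void OptimizeBlockSketch()
{
#ifndef TARGET_WASM
    LowerRange();      // native targets lower newly inserted nodes immediately
#else
    // Wasm: the LIR is presumably picked up by the LLVM-based backend later, so nothing to do here.
#endif
}

int main()
{
    OptimizeBlockSketch();
    return 0;
}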
@@ -1526,6 +1527,7 @@ inline void Compiler::fgMarkLoopHead(BasicBlock* block) */ assert(!codeGen->isGCTypeFixed()); +#endif // !TARGET_WASM if (!compCanEncodePtrArgCntMax()) { diff --git a/src/coreclr/jit/gcencode.cpp b/src/coreclr/jit/gcencode.cpp index bfe9dbe3ee91..9a0c700ddff2 100644 --- a/src/coreclr/jit/gcencode.cpp +++ b/src/coreclr/jit/gcencode.cpp @@ -11,7 +11,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ - +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -4832,6 +4832,7 @@ void GCInfo::gcInfoRecordGCStackArgsDead(GcInfoEncoder* gcInfoEncoder, #undef GCENCODER_WITH_LOGGING #endif // !JIT32_GCENCODER +#endif // !TARGET_WASM /*****************************************************************************/ /*****************************************************************************/ diff --git a/src/coreclr/jit/gcinfo.cpp b/src/coreclr/jit/gcinfo.cpp index b75ddb1d86fb..81ffa1115d4e 100644 --- a/src/coreclr/jit/gcinfo.cpp +++ b/src/coreclr/jit/gcinfo.cpp @@ -76,6 +76,7 @@ void GCInfo::gcResetForBB() VarSetOps::AssignNoCopy(compiler, gcVarPtrSetCur, VarSetOps::MakeEmpty(compiler)); } +#ifndef TARGET_WASM #ifdef DEBUG /***************************************************************************** @@ -220,6 +221,7 @@ void GCInfo::gcMarkRegPtrVal(regNumber reg, var_types type) break; } } +#endif !TARGET_WASM /*****************************************************************************/ diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index b80564a58b4e..f5a44c7c9938 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2855,6 +2855,466 @@ bool Compiler::gtCanSwapOrder(GenTree* firstNode, GenTree* secondNode) return canSwap; } +#ifdef TARGET_WASM +bool genCreateAddrMode(Compiler* compiler, GenTree* addr, + bool fold, + bool* revPtr, + GenTree** rv1Ptr, + GenTree** rv2Ptr, +#if SCALED_ADDR_MODES + unsigned* mulPtr, +#endif // SCALED_ADDR_MODES + ssize_t* cnsPtr) +{ + /* + The following indirections are valid address modes on x86/x64: + + [ icon] * not handled here + [reg ] + [reg + icon] + [reg1 + reg2 ] + [reg1 + reg2 + icon] + [reg1 + 2 * reg2 ] + [reg1 + 4 * reg2 ] + [reg1 + 8 * reg2 ] + [ 2 * reg2 + icon] + [ 4 * reg2 + icon] + [ 8 * reg2 + icon] + [reg1 + 2 * reg2 + icon] + [reg1 + 4 * reg2 + icon] + [reg1 + 8 * reg2 + icon] + + The following indirections are valid address modes on arm64: + + [reg] + [reg + icon] + [reg1 + reg2] + [reg1 + reg2 * natural-scale] + + */ + + /* All indirect address modes require the address to be an addition */ + + if (addr->gtOper != GT_ADD) + { + return false; + } + + // Can't use indirect addressing mode as we need to check for overflow. + // Also, can't use 'lea' as it doesn't set the flags. + + if (addr->gtOverflow()) + { + return false; + } + + GenTree* rv1 = nullptr; + GenTree* rv2 = nullptr; + + GenTree* op1; + GenTree* op2; + + ssize_t cns; +#if SCALED_ADDR_MODES + unsigned mul; +#endif // SCALED_ADDR_MODES + + GenTree* tmp; + + /* What order are the sub-operands to be evaluated */ + + if (addr->gtFlags & GTF_REVERSE_OPS) + { + op1 = addr->AsOp()->gtOp2; + op2 = addr->AsOp()->gtOp1; + } + else + { + op1 = addr->AsOp()->gtOp1; + op2 = addr->AsOp()->gtOp2; + } + + bool rev = false; // Is op2 first in the evaluation order? + + /* + A complex address mode can combine the following operands: + + op1 ... base address + op2 ... 
optional scaled index +#if SCALED_ADDR_MODES + mul ... optional multiplier (2/4/8) for op2 +#endif + cns ... optional displacement + + Here we try to find such a set of operands and arrange for these + to sit in registers. + */ + + cns = 0; +#if SCALED_ADDR_MODES + mul = 0; +#endif // SCALED_ADDR_MODES + +AGAIN: + /* We come back to 'AGAIN' if we have an add of a constant, and we are folding that + constant, or we have gone through a GT_NOP or GT_COMMA node. We never come back + here if we find a scaled index. + */ + CLANG_FORMAT_COMMENT_ANCHOR; + +#if SCALED_ADDR_MODES + assert(mul == 0); +#endif // SCALED_ADDR_MODES + + /* Special case: keep constants as 'op2' */ + + if (op1->IsCnsIntOrI()) + { + // Presumably op2 is assumed to not be a constant (shouldn't happen if we've done constant folding)? + tmp = op1; + op1 = op2; + op2 = tmp; + } + + /* Check for an addition of a constant */ + + if (op2->IsIntCnsFitsInI32() && (op2->gtType != TYP_REF) && FitsIn(cns + op2->AsIntConCommon()->IconValue())) + { + // We should not be building address modes out of non-foldable constants + assert(op2->AsIntConCommon()->ImmedValCanBeFolded(compiler, addr->OperGet())); + + /* We're adding a constant */ + + cns += op2->AsIntConCommon()->IconValue(); + +#if defined(TARGET_ARMARCH) + if (cns == 0) +#endif + { + /* Inspect the operand the constant is being added to */ + + switch (op1->gtOper) + { + case GT_ADD: + + if (op1->gtOverflow()) + { + break; + } + + op2 = op1->AsOp()->gtOp2; + op1 = op1->AsOp()->gtOp1; + + goto AGAIN; + +#if SCALED_ADDR_MODES && !defined(TARGET_ARMARCH) + // TODO-ARM64-CQ, TODO-ARM-CQ: For now we don't try to create a scaled index. + case GT_MUL: + if (op1->gtOverflow()) + { + return false; // Need overflow check + } + + FALLTHROUGH; + + case GT_LSH: + + mul = op1->GetScaledIndex(); + if (mul) + { + /* We can use "[mul*rv2 + icon]" */ + + rv1 = nullptr; + rv2 = op1->AsOp()->gtOp1; + + goto FOUND_AM; + } + break; +#endif // SCALED_ADDR_MODES && !defined(TARGET_ARMARCH) + + default: + break; + } + } + + /* The best we can do is "[rv1 + icon]" */ + + rv1 = op1; + rv2 = nullptr; + + goto FOUND_AM; + } + + // op2 is not a constant. So keep on trying. + + /* Neither op1 nor op2 are sitting in a register right now */ + + switch (op1->gtOper) + { +#if !defined(TARGET_ARMARCH) + // TODO-ARM64-CQ, TODO-ARM-CQ: For now we don't try to create a scaled index. 
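A concrete instance of the shapes the comment block above enumerates: a 4-byte element access like a[i + 3] might arrive as ADD(ADD(a, LSH(i, 2)), 12) and should decompose into rv1 = a, rv2 = i, mul = 4, cns = 12, that is, the [reg1 + 4*reg2 + icon] form. The struct and helper below are a tiny standalone model of that result, not the JIT's own types.

#include <cassert>
#include <cstdint>

// What a folded address mode boils down to: base + scale*index + displacement.
struct AddrMode
{
    std::intptr_t base;
    std::intptr_t index;
    unsigned      scale;   // 1, 2, 4 or 8
    std::intptr_t disp;
};

static std::intptr_t EffectiveAddress(const AddrMode& am)
{
    return am.base + static_cast<std::intptr_t>(am.scale) * am.index + am.disp;
}

int main()
{
    std::intptr_t a = 0x1000;          // array base
    std::intptr_t i = 5;               // element index
    AddrMode am{a, i, 4, 12};          // [a + 4*i + 12], i.e. &a[i + 3] for 4-byte elements
    assert(EffectiveAddress(am) == a + 4 * i + 12);
    return 0;
}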
+ case GT_ADD: + + if (op1->gtOverflow()) + { + break; + } + + if (op1->AsOp()->gtOp2->IsIntCnsFitsInI32() && + FitsIn(cns + op1->AsOp()->gtOp2->AsIntCon()->gtIconVal)) + { + cns += op1->AsOp()->gtOp2->AsIntCon()->gtIconVal; + op1 = op1->AsOp()->gtOp1; + + goto AGAIN; + } + + break; + +#if SCALED_ADDR_MODES + + case GT_MUL: + + if (op1->gtOverflow()) + { + break; + } + + FALLTHROUGH; + + case GT_LSH: + + mul = op1->GetScaledIndex(); + if (mul) + { + /* 'op1' is a scaled value */ + + rv1 = op2; + rv2 = op1->AsOp()->gtOp1; + + int argScale; + while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0) + { + if (jitIsScaleIndexMul(argScale * mul)) + { + mul = mul * argScale; + rv2 = rv2->AsOp()->gtOp1; + } + else + { + break; + } + } + + noway_assert(rev == false); + rev = true; + + goto FOUND_AM; + } + break; + +#endif // SCALED_ADDR_MODES +#endif // !TARGET_ARMARCH + + case GT_NOP: + + op1 = op1->AsOp()->gtOp1; + goto AGAIN; + + case GT_COMMA: + + op1 = op1->AsOp()->gtOp2; + goto AGAIN; + + default: + break; + } + + noway_assert(op2); + switch (op2->gtOper) + { +#if !defined(TARGET_ARMARCH) + // TODO-ARM64-CQ, TODO-ARM-CQ: For now we don't try to create a scaled index. + case GT_ADD: + + if (op2->gtOverflow()) + { + break; + } + + if (op2->AsOp()->gtOp2->IsIntCnsFitsInI32() && + FitsIn(cns + op2->AsOp()->gtOp2->AsIntCon()->gtIconVal)) + { + cns += op2->AsOp()->gtOp2->AsIntCon()->gtIconVal; + op2 = op2->AsOp()->gtOp1; + + goto AGAIN; + } + + break; + +#if SCALED_ADDR_MODES + + case GT_MUL: + + if (op2->gtOverflow()) + { + break; + } + + FALLTHROUGH; + + case GT_LSH: + + mul = op2->GetScaledIndex(); + if (mul) + { + // 'op2' is a scaled value...is it's argument also scaled? + int argScale; + rv2 = op2->AsOp()->gtOp1; + while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0) + { + if (jitIsScaleIndexMul(argScale * mul)) + { + mul = mul * argScale; + rv2 = rv2->AsOp()->gtOp1; + } + else + { + break; + } + } + + rv1 = op1; + + goto FOUND_AM; + } + break; + +#endif // SCALED_ADDR_MODES +#endif // !TARGET_ARMARCH + + case GT_NOP: + + op2 = op2->AsOp()->gtOp1; + goto AGAIN; + + case GT_COMMA: + + op2 = op2->AsOp()->gtOp2; + goto AGAIN; + + default: + break; + } + + /* The best we can do "[rv1 + rv2]" or "[rv1 + rv2 + cns]" */ + + rv1 = op1; + rv2 = op2; +#ifdef TARGET_ARM64 + assert(cns == 0); +#endif + +FOUND_AM: + + if (rv2) + { + /* Make sure a GC address doesn't end up in 'rv2' */ + + if (varTypeIsGC(rv2->TypeGet())) + { + noway_assert(rv1 && !varTypeIsGC(rv1->TypeGet())); + + tmp = rv1; + rv1 = rv2; + rv2 = tmp; + + rev = !rev; + } + + /* Special case: constant array index (that is range-checked) */ + + if (fold) + { + ssize_t tmpMul; + GenTree* index; + + if ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (rv2->AsOp()->gtOp2->IsCnsIntOrI())) + { + /* For valuetype arrays where we can't use the scaled address + mode, rv2 will point to the scaled index. So we have to do + more work */ + assert(false); //TODO: this method is copied from Compiler, but not properly refactored so this call is not available, put off for now and investigate when it gets hit + index = NULL; + tmpMul = 1; // compiler->optGetArrayRefScaleAndIndex(rv2, &index DEBUGARG(false)); + if (mul) + { + tmpMul *= mul; + } + } + else + { + /* May be a simple array. 
rv2 will points to the actual index */ + + index = rv2; + tmpMul = mul; + } + + /* Get hold of the array index and see if it's a constant */ + if (index->IsIntCnsFitsInI32()) + { + /* Get hold of the index value */ + ssize_t ixv = index->AsIntConCommon()->IconValue(); + +#if SCALED_ADDR_MODES + /* Scale the index if necessary */ + if (tmpMul) + { + ixv *= tmpMul; + } +#endif + + if (FitsIn(cns + ixv)) + { + /* Add the scaled index to the offset value */ + + cns += ixv; + +#if SCALED_ADDR_MODES + /* There is no scaled operand any more */ + mul = 0; +#endif + rv2 = nullptr; + } + } + } + } + + // We shouldn't have [rv2*1 + cns] - this is equivalent to [rv1 + cns] + noway_assert(rv1 || mul != 1); + + noway_assert(FitsIn(cns)); + + if (rv1 == nullptr && rv2 == nullptr) + { + return false; + } + + /* Success - return the various components to the caller */ + + *revPtr = rev; + *rv1Ptr = rv1; + *rv2Ptr = rv2; +#if SCALED_ADDR_MODES + * mulPtr = mul; +#endif + * cnsPtr = cns; + + return true; +} +#endif // TARGET_WASM + //------------------------------------------------------------------------ // Given an address expression, compute its costs and addressing mode opportunities, // and mark addressing mode candidates as GTF_DONT_CSE. @@ -2884,7 +3344,11 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ GenTree* base; // This is the base of the address. GenTree* idx; // This is the index. +#ifdef TARGET_WASM + if (genCreateAddrMode(this, addr, false /*fold*/, &rev, &base, &idx, +#else if (codeGen->genCreateAddrMode(addr, false /*fold*/, &rev, &base, &idx, +#endif #if SCALED_ADDR_MODES &mul, #endif // SCALED_ADDR_MODES @@ -2894,7 +3358,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ // nodes with GTF_ADDRMODE_NO_CSE and calculate a more accurate cost. addr->gtFlags |= GTF_ADDRMODE_NO_CSE; -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm // addrmodeCount is the count of items that we used to form // an addressing mode. The maximum value is 4 when we have // all of these: { base, idx, cns, mul } @@ -3088,7 +3552,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ // we have already found either a non-ADD op1 or a non-constant op2. gtWalkOp(&op1, &op2, nullptr, true); -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // For XARCH we will fold GT_ADDs in the op2 position into the addressing mode, so we call // gtWalkOp on both operands of the original GT_ADD. // This is not done for ARMARCH. Though the stated reason is that we don't try to create a @@ -3098,7 +3562,7 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ // into the addressing mode. // Walk op2 looking for non-overflow GT_ADDs of constants. 
gtWalkOp(&op2, &op1, nullptr, true); -#endif // defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#endif // defined(TARGET_XARCH) || defined(TARGET_WASM) // OK we are done walking the tree // Now assert that op1 and op2 correspond with base and idx @@ -3331,7 +3795,7 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) goto COMMON_CNS; } -#elif defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#elif defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm case GT_CNS_STR: #ifdef TARGET_AMD64 @@ -3666,7 +4130,7 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree) costEx = IND_COST_EX * 2; costSz = 6; } -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) costEx = 1; costSz = 2; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 5d187615ad28..4ce71a5ac966 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -11,6 +11,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -65,7 +66,7 @@ const char* CodeGen::genInsName(instruction ins) #define INST9(id, nm, ldst, fmt, e1, e2, e3, e4, e5, e6, e7, e8, e9 ) nm, #include "instrs.h" -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #define INST0(id, nm, um, mr, flags) nm, #define INST1(id, nm, um, mr, flags) nm, #define INST2(id, nm, um, mr, mi, flags) nm, @@ -1819,7 +1820,7 @@ instruction CodeGen::ins_Copy(var_types dstType) { return INS_mov; } -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) if (varTypeIsSIMD(dstType)) { return INS_movaps; @@ -1858,7 +1859,7 @@ instruction CodeGen::ins_Copy(regNumber srcReg, var_types dstType) { return ins_Copy(dstType); } -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm if (dstIsFloatReg) { return INS_mov_i2xmm; @@ -2013,7 +2014,7 @@ instruction CodeGenInterface::ins_StoreFromSrc(regNumber srcReg, var_types dstTy return ins; } -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) bool CodeGen::isMoveIns(instruction ins) { @@ -2397,7 +2398,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) } #endif // DEBUG -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm // only full barrier needs to be emitted on Xarch if (barrierKind != BARRIER_FULL) { @@ -2426,7 +2427,7 @@ void CodeGen::instGen_Set_Reg_To_Zero(emitAttr size, regNumber reg, insFlags fla GetEmitter()->emitIns_R_R(INS_xor, size, reg, reg); #elif defined(TARGET_ARMARCH) GetEmitter()->emitIns_R_I(INS_mov, size, reg, 0 ARM_ARG(flags)); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) GetEmitter()->emitIns_R_R(INS_xor, size, reg, reg); #else #error "Unknown TARGET" @@ -2445,7 +2446,7 @@ void CodeGen::instGen_Compare_Reg_To_Zero(emitAttr size, regNumber reg) GetEmitter()->emitIns_R_R(INS_test, size, reg, reg); #elif defined(TARGET_ARMARCH) GetEmitter()->emitIns_R_I(INS_cmp, size, reg, 0); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) GetEmitter()->emitIns_R_R(INS_test, size, reg, reg); #else #error "Unknown TARGET" @@ 
-2459,7 +2460,7 @@ void CodeGen::instGen_Compare_Reg_To_Zero(emitAttr size, regNumber reg) */ void CodeGen::instGen_Compare_Reg_To_Reg(emitAttr size, regNumber reg1, regNumber reg2) { -#if defined(TARGET_XARCH) || defined(TARGET_ARMARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_ARMARCH) || defined(TARGET_WASM) // TODO Wasm GetEmitter()->emitIns_R_R(INS_cmp, size, reg1, reg2); #else #error "Unknown TARGET" @@ -2479,7 +2480,7 @@ void CodeGen::instGen_Compare_Reg_To_Imm(emitAttr size, regNumber reg, target_ss } else { -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm #if defined(TARGET_AMD64) if ((EA_SIZE(size) == EA_8BYTE) && (((int)imm != (ssize_t)imm) || EA_IS_CNS_RELOC(size))) { @@ -2535,6 +2536,7 @@ void CodeGen::instGen_Store_Reg_Into_Lcl(var_types dstType, regNumber srcReg, in GetEmitter()->emitIns_S_R(ins_Store(dstType), size, srcReg, varNum, offs); } +#endif // !TARGET_WASM /*****************************************************************************/ /*****************************************************************************/ diff --git a/src/coreclr/jit/jiteh.cpp b/src/coreclr/jit/jiteh.cpp index 13f6e19df1d9..129cb61e06b1 100644 --- a/src/coreclr/jit/jiteh.cpp +++ b/src/coreclr/jit/jiteh.cpp @@ -891,7 +891,7 @@ unsigned Compiler::ehGetCallFinallyRegionIndex(unsigned finallyIndex, bool* inTr assert(finallyIndex != EHblkDsc::NO_ENCLOSING_INDEX); assert(ehGetDsc(finallyIndex)->HasFinallyHandler()); -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM) return ehGetDsc(finallyIndex)->ebdGetEnclosingRegionIndex(inTryRegion); #else *inTryRegion = true; @@ -1092,6 +1092,7 @@ void* Compiler::ehEmitCookie(BasicBlock* block) return cookie; } +#ifndef TARGET_WASM /***************************************************************************** * Determine the emitter code offset for a block. If the block is a finally * target, choose the offset of the NOP padding that precedes the block. 
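The instGen_* helpers touched above keep the xarch idioms for the Wasm define as well: a register is zeroed with xor reg, reg rather than a move of an immediate (x ^ x is 0 for every x, and the xor form is typically a shorter encoding), and a zero test uses test reg, reg (reg & reg == reg, so the zero flag is set exactly when the register is zero). A short standalone check of the two identities those instructions rely on.

#include <cassert>
#include <cstdint>

int main()
{
    const std::uint64_t vals[] = {0, 1, 0xDEADBEEF, ~std::uint64_t(0)};
    for (std::uint64_t x : vals)
    {
        assert((x ^ x) == 0);   // the identity behind "xor reg, reg" for zeroing
        assert((x & x) == x);   // the identity behind "test reg, reg" for a zero check
    }
    return 0;
}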
@@ -1101,6 +1102,7 @@ UNATIVE_OFFSET Compiler::ehCodeOffset(BasicBlock* block) { return GetEmitter()->emitCodeOffset(ehEmitCookie(block), 0); } +#endif // !TARGET_WASM /****************************************************************************/ @@ -2979,10 +2981,12 @@ void Compiler::dispOutgoingEHClause(unsigned num, const CORINFO_EH_CLAUSE& claus { if (opts.dspDiffable) { +#ifndef TARGET_WASM /* (( brace matching editor workaround to compensate for the following line */ printf("EH#%u: try [%s..%s) handled by [%s..%s) ", num, GetEmitter()->emitOffsetToLabel(clause.TryOffset), GetEmitter()->emitOffsetToLabel(clause.TryLength), GetEmitter()->emitOffsetToLabel(clause.HandlerOffset), GetEmitter()->emitOffsetToLabel(clause.HandlerLength)); +#endif // !TARGET_WASM } else { @@ -3005,9 +3009,11 @@ void Compiler::dispOutgoingEHClause(unsigned num, const CORINFO_EH_CLAUSE& claus case CORINFO_EH_CLAUSE_FILTER: if (opts.dspDiffable) { +#ifndef TARGET_WASM /* ( brace matching editor workaround to compensate for the following line */ printf("filter at [%s..%s)", GetEmitter()->emitOffsetToLabel(clause.ClassToken), GetEmitter()->emitOffsetToLabel(clause.HandlerOffset)); +#endif // !TARGET_WASM } else { diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 70e54fc59005..9f56e8d3772b 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -344,12 +344,14 @@ void Compiler::lvaInitTypeRef() // emitter when the varNum is greater that 32767 (see emitLclVarAddr::initLclVarAddr) lvaAllocOutgoingArgSpaceVar(); +#ifndef TARGET_WASM #ifdef DEBUG if (verbose) { lvaTableDump(INITIAL_FRAME_LAYOUT); } #endif +#endif //!TARGET_WASM } /*****************************************************************************/ @@ -423,8 +425,10 @@ void Compiler::lvaInitArgs(InitVarDscInfo* varDscInfo) noway_assert(varDscInfo->varNum == info.compArgsCount); assert(varDscInfo->intRegArgNum <= MAX_REG_ARG); +#ifndef TARGET_WASM codeGen->intRegState.rsCalleeRegArgCount = varDscInfo->intRegArgNum; codeGen->floatRegState.rsCalleeRegArgCount = varDscInfo->floatRegArgNum; +#endif // !TARGET_WASM #if FEATURE_FASTTAILCALL // Save the stack usage information @@ -2524,6 +2528,7 @@ void Compiler::lvaPromoteLongVars() } } +#ifndef TARGET_WASM #ifdef DEBUG if (verbose) { @@ -2531,6 +2536,7 @@ void Compiler::lvaPromoteLongVars() lvaTableDump(); } #endif // DEBUG +#endif //!TARGET_WASM } #endif // !defined(TARGET_64BIT) @@ -4582,6 +4588,7 @@ inline void Compiler::lvaIncrementFrameSize(unsigned size) compLclFrameSize += size; } +#ifndef TARGET_WASM /**************************************************************************** * * Return true if absolute offsets of temps are larger than vars, or in other @@ -5700,7 +5707,7 @@ int Compiler::lvaAssignVirtualFrameOffsetToArg(unsigned lclNum, #if defined(TARGET_X86) argOffs += TARGET_POINTER_SIZE; -#elif defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#elif defined(TARGET_AMD64) || defined(TARGET_WASM) // TODO Wasm // Register arguments on AMD64 also takes stack space. (in the backing store) varDsc->SetStackOffset(argOffs); argOffs += TARGET_POINTER_SIZE; @@ -7713,6 +7720,7 @@ int Compiler::lvaGetInitialSPRelativeOffset(unsigned varNum) return lvaToInitialSPRelativeOffset(varDsc->GetStackOffset(), varDsc->lvFramePointerBased); } +#endif // !TARGET_WASM // Given a local variable offset, and whether that offset is frame-pointer based, return its offset from Initial-SP. 
// This is used, for example, to figure out the offset of the frame pointer from Initial-SP. diff --git a/src/coreclr/jit/liveness.cpp b/src/coreclr/jit/liveness.cpp index 3d3c5481de28..b25f35e77621 100644 --- a/src/coreclr/jit/liveness.cpp +++ b/src/coreclr/jit/liveness.cpp @@ -131,10 +131,12 @@ void Compiler::fgLocalVarLiveness() { printf("*************** In fgLocalVarLiveness()\n"); +#ifndef TARGET_WASM if (compRationalIRForm) { lvaTableDump(); } +#endif //!TARGET_WASM } #endif // DEBUG @@ -1030,7 +1032,9 @@ void Compiler::fgExtendDbgLifetimes() #if !defined(TARGET_64BIT) && !defined(TARGET_WASM32) && !defined(TARGET_WASM64) DecomposeLongs::DecomposeRange(this, initRange); #endif // !defined(TARGET_64BIT) +#ifndef TARGET_WASM m_pLowering->LowerRange(block, initRange); +#endif // !TARGET_WASM // Naively inserting the initializer at the end of the block may add code after the block's // terminator, in which case the inserted code will never be executed (and the IR for the @@ -1993,10 +1997,12 @@ void Compiler::fgComputeLifeLIR(VARSET_TP& life, BasicBlock* block, VARSET_VALAR store->OperIs(GT_STOREIND) ? store->AsStoreInd()->Data() : store->AsBlk()->Data(); data->SetUnusedValue(); +#ifndef TARGET_WASM if (data->isIndir()) { Lowering::TransformUnusedIndirection(data->AsIndir(), this, block); } +#endif // !TARGET_WASM fgRemoveDeadStoreLIR(store, block); } @@ -2115,12 +2121,14 @@ void Compiler::fgComputeLifeLIR(VARSET_TP& life, BasicBlock* block, VARSET_VALAR case GT_DYN_BLK: { bool removed = fgTryRemoveNonLocal(node, &blockRange); +#ifndef TARGET_WASM if (!removed && node->IsUnusedValue()) { // IR doesn't expect dummy uses of `GT_OBJ/BLK/DYN_BLK`. JITDUMP("Transform an unused OBJ/BLK node [%06d]\n", dspTreeID(node)); Lowering::TransformUnusedIndirection(node->AsIndir(), this, block); } +#endif // !TARGET_WASM } break; diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 7d4725cb3ccd..5b2390b7506b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -14,6 +14,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -5339,7 +5340,7 @@ GenTree* Lowering::LowerConstIntDivOrMod(GenTree* node) return nullptr; } -#if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_WASM) // TODO Wasm ssize_t magic; int shift; @@ -6675,3 +6676,4 @@ bool Lowering::TryTransformStoreObjAsStoreInd(GenTreeBlk* blkNode) LowerStoreIndirCommon(blkNode); return true; } +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index e63955921580..c645409c03b5 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -10,7 +10,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ - +#ifndef TARGET_WASM #ifndef _LOWER_H_ #define _LOWER_H_ @@ -97,7 +97,7 @@ class Lowering final : public Phase void ContainCheckCompare(GenTreeOp* node); void ContainCheckBinary(GenTreeOp* node); void ContainCheckBoundsChk(GenTreeBoundsChk* node); -#if defined(TARGET_XARCH) || defined (TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || 
defined(TARGET_WASM) void ContainCheckFloatBinary(GenTreeOp* node); void ContainCheckIntrinsic(GenTreeOp* node); #endif // TARGET_XARCH @@ -228,7 +228,7 @@ class Lowering final : public Phase // return true if this call target is within range of a pc-rel call on the machine bool IsCallTargetInRange(void* addr); -#if defined(TARGET_XARCH) || defined (TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) GenTree* PreferredRegOptionalOperand(GenTree* tree); // ------------------------------------------------------------------ @@ -579,3 +579,4 @@ class Lowering final : public Phase }; #endif // _LOWER_H_ +#endif // TARGET_WASM diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index de515f120683..5551d5f036a7 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -89,6 +89,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -11299,3 +11300,4 @@ void LinearScan::verifyResolutionMove(GenTree* resolutionMove, LsraLocation curr } } #endif // DEBUG +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index de77e0fa4cfe..60de4a035742 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1,7 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. /*****************************************************************************/ - +#ifndef TARGET_WASM #ifndef _LSRA_H_ #define _LSRA_H_ @@ -735,7 +735,7 @@ class LinearScan : public LinearScanInterface // Hence the "SmallFPSet" has 5 elements. CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_AMD64) || defined(TARGET_WASM) #ifdef UNIX_AMD64_ABI // On System V the RDI and RSI are not callee saved. Use R12 ans R13 as callee saved registers. static const regMaskTP LsraLimitSmallIntSet = @@ -757,7 +757,7 @@ class LinearScan : public LinearScanInterface #elif defined(TARGET_X86) static const regMaskTP LsraLimitSmallIntSet = (RBM_EAX | RBM_ECX | RBM_EDI); static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #ifdef UNIX_AMD64_ABI // On System V the RDI and RSI are not callee saved. Use R12 ans R13 as callee saved registers. static const regMaskTP LsraLimitSmallIntSet = @@ -1733,7 +1733,7 @@ class LinearScan : public LinearScanInterface void setDelayFree(RefPosition* use); int BuildBinaryUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE); -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) int BuildRMWUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE); #endif // !TARGET_XARCH // This is the main entry point for building the RefPositions for a node. 
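The Lsra limit sets being adjusted above are plain register bit sets: regMaskTP gives every register one bit, and a set such as (RBM_EAX | RBM_ECX | RBM_EDI) is just the OR of those bits. A small standalone model of that representation; the bit positions chosen below are made up for illustration.

#include <bitset>
#include <cassert>
#include <cstdint>

typedef std::uint64_t regMaskTP;                 // one bit per register

const regMaskTP RBM_EAX = regMaskTP(1) << 0;
const regMaskTP RBM_ECX = regMaskTP(1) << 1;
const regMaskTP RBM_EDI = regMaskTP(1) << 7;

int main()
{
    regMaskTP limitSmallIntSet = RBM_EAX | RBM_ECX | RBM_EDI;      // a three-register set
    assert((limitSmallIntSet & RBM_ECX) != 0);                     // membership test
    assert(std::bitset<64>(limitSmallIntSet).count() == 3);        // number of registers in the set
    limitSmallIntSet &= ~RBM_EDI;                                  // drop a register from the set
    assert(std::bitset<64>(limitSmallIntSet).count() == 2);
    return 0;
}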
@@ -1754,7 +1754,7 @@ class LinearScan : public LinearScanInterface void BuildDefsWithKills(GenTree* tree, int dstCount, regMaskTP dstCandidates, regMaskTP killMask); int BuildReturn(GenTree* tree); -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // This method, unlike the others, returns the number of sources, since it may be called when // 'tree' is contained. int BuildShiftRotate(GenTree* tree); @@ -1775,7 +1775,7 @@ class LinearScan : public LinearScanInterface int BuildGCWriteBarrier(GenTree* tree); int BuildCast(GenTreeCast* cast); -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) // returns true if the tree can use the read-modify-write memory instruction form bool isRMWRegOper(GenTree* tree); int BuildMul(GenTree* tree); @@ -2386,3 +2386,4 @@ void dumpRegMask(regMaskTP regs); /*****************************************************************************/ #endif //_LSRA_H_ /*****************************************************************************/ +#endif // TARGET_WASM diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 06b6799c1824..3899abd600ac 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -13,6 +13,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -3905,3 +3906,4 @@ int LinearScan::BuildCmp(GenTree* tree) } return srcCount; } +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index ab274a1b923b..58478e05ad64 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -3065,7 +3065,7 @@ void Compiler::fgInitArgInfo(GenTreeCall* call) #elif defined(TARGET_X86) passUsingFloatRegs = false; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) passUsingFloatRegs = varTypeIsFloating(argx); @@ -3118,7 +3118,7 @@ void Compiler::fgInitArgInfo(GenTreeCall* call) assert(structSize == info.compCompHnd->getClassSize(objClass)); } } -#if defined(TARGET_AMD64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_AMD64) || defined(TARGET_WASM) // TODO Wasm #ifdef UNIX_AMD64_ABI if (!isStructArg) { @@ -8587,8 +8587,13 @@ void Compiler::fgMorphTailCallViaJitHelper(GenTreeCall* call) assert(ppArg != nullptr); assert(*ppArg == nullptr); +#ifndef TARGET_WASM unsigned nOldStkArgsWords = (compArgSize - (codeGen->intRegState.rsCalleeRegArgCount * REGSIZE_BYTES)) / REGSIZE_BYTES; +#else + unsigned nOldStkArgsWords = 0; + assert(false); // TODO: Wasm: what to do here? +#endif // !TARGET_WASM GenTree* arg3 = gtNewIconNode((ssize_t)nOldStkArgsWords, TYP_I_IMPL); *ppArg = gtNewCallArgs(arg3); // numberOfOldStackArgs ppArg = &((*ppArg)->NextRef()); @@ -17108,6 +17113,7 @@ void Compiler::fgMergeBlockReturn(BasicBlock* block) } } +#ifndef TARGET_WASM /***************************************************************************** * * Make some decisions about the kind of code to generate. @@ -17218,6 +17224,7 @@ void Compiler::fgSetOptions() // printf("method will %s be fully interruptible\n", GetInterruptible() ? 
" " : "not"); } +#endif // !TARGET_WASM /*****************************************************************************/ @@ -17908,7 +17915,9 @@ void Compiler::fgPromoteStructs() if (verbose) { printf("\nlvaTable before fgPromoteStructs\n"); +#ifndef TARGET_WASM lvaTableDump(); +#endif //!TARGET_WASM } #endif // DEBUG @@ -17980,7 +17989,9 @@ void Compiler::fgPromoteStructs() if (verbose) { printf("\nlvaTable after fgPromoteStructs\n"); +#ifndef TARGET_WASM lvaTableDump(); +#endif //!TARGET_WASM } #endif // DEBUG } diff --git a/src/coreclr/jit/regalloc.cpp b/src/coreclr/jit/regalloc.cpp index 5e609b66d519..258b2b0cb2f6 100644 --- a/src/coreclr/jit/regalloc.cpp +++ b/src/coreclr/jit/regalloc.cpp @@ -109,6 +109,7 @@ bool Compiler::shouldDoubleAlign(unsigned refCntStk, } #endif // DOUBLE_ALIGN +#ifndef TARGET_WASM // The code to set the regState for each arg is outlined for shared use // by linear scan. (It is not shared for System V AMD64 platform.) regNumber Compiler::raUpdateRegStateForArg(RegState* regState, LclVarDsc* argDsc) @@ -194,6 +195,7 @@ regNumber Compiler::raUpdateRegStateForArg(RegState* regState, LclVarDsc* argDsc return inArgReg; } +#endif // !TARGET_WASM /****************************************************************************/ /* Returns true when we must create an EBP frame @@ -385,7 +387,11 @@ void Compiler::raMarkStkVars() // stack frame NOT_STK:; +#ifndef TARGET_WASM varDsc->lvFramePointerBased = codeGen->isFramePointerUsed(); +#else + varDsc->lvFramePointerBased = false; // TODO Wasm sensible default? +#endif #if DOUBLE_ALIGN diff --git a/src/coreclr/jit/regset.cpp b/src/coreclr/jit/regset.cpp index 1a7816d6ca25..f18206dc0499 100644 --- a/src/coreclr/jit/regset.cpp +++ b/src/coreclr/jit/regset.cpp @@ -189,6 +189,7 @@ void RegSet::rsRemoveRegsModified(regMaskTP mask) void RegSet::SetMaskVars(regMaskTP newMaskVars) { +#ifndef TARGET_WASM #ifdef DEBUG if (m_rsCompiler->verbose) { @@ -208,6 +209,7 @@ void RegSet::SetMaskVars(regMaskTP newMaskVars) printf("\n"); } #endif // DEBUG +#endif // !TARGET_WASM _rsMaskVars = newMaskVars; } @@ -431,7 +433,11 @@ void RegSet::rsSpillTree(regNumber reg, GenTree* tree, unsigned regIdx /* =0 */) // Generate the code to spill the register var_types storeType = floatSpill ? 
treeType : tempType; +#ifndef TARGET_WASM m_rsCompiler->codeGen->spillReg(storeType, temp, reg); +#else + assert(false); // TODO +#endif // !TARGET_WASM // Mark the tree node as having been spilled rsMarkSpill(tree, reg); diff --git a/src/coreclr/jit/scopeinfo.cpp b/src/coreclr/jit/scopeinfo.cpp index 97dc968becb4..5f7b09de0974 100644 --- a/src/coreclr/jit/scopeinfo.cpp +++ b/src/coreclr/jit/scopeinfo.cpp @@ -48,6 +48,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX * ****************************************************************************** */ +#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -1881,3 +1882,4 @@ void CodeGen::psiMoveToStack(unsigned varNum) #endif // ACCURATE_PROLOG_DEBUG_INFO } #endif // USING_SCOPE_INFO +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index 361a4faadf53..cae8de063bc9 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -19,8 +19,12 @@ StackLevelSetter::StackLevelSetter(Compiler* compiler) , throwHelperBlocksUsed(comp->fgUseThrowHelperBlocks() && comp->compUsesThrowHelper) #endif // !FEATURE_FIXED_OUT_ARGS { +#ifndef TARGET_WASM // The constructor reads this value to skip iterations that could set it if it is already set. compiler->codeGen->resetWritePhaseForFramePointerRequired(); +#else + assert(false); // Wasm - TODO can this be ignored? +#endif // !TARGET_WASM } //------------------------------------------------------------------------ @@ -334,7 +338,11 @@ void StackLevelSetter::CheckArgCnt() printf("Too many pushed arguments for an ESP based encoding, forcing an EBP frame\n"); } #endif +#ifndef TARGET_WASM comp->codeGen->setFramePointerRequired(true); +#else + assert(false); // Wasm - TODO can this be ignored? 
+#endif // !TARGET_WASM } } diff --git a/src/coreclr/jit/treelifeupdater.cpp b/src/coreclr/jit/treelifeupdater.cpp index d81ce887218d..a170a8eb53f3 100644 --- a/src/coreclr/jit/treelifeupdater.cpp +++ b/src/coreclr/jit/treelifeupdater.cpp @@ -56,6 +56,7 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns if (isBorn || isDying) { +#ifndef TARGET_WASM if (ForCodeGen) { regNumber reg = lclNode->GetRegNumByIdx(multiRegIndex); @@ -70,6 +71,8 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns compiler->codeGen->genUpdateRegLife(fldVarDsc, isBorn, isDying DEBUGARG(lclNode)); } } +#endif // !TARGET_WASM + // First, update the live set if (isDying) { @@ -96,6 +99,7 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns VarSetOps::Assign(compiler, compiler->compCurLife, newLife); +#ifndef TARGET_WASM if (ForCodeGen) { // Only add vars to the gcInfo.gcVarPtrSetCur if they are currently on stack, since the @@ -122,6 +126,7 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns VarSetOps::RemoveElemD(compiler, compiler->codeGen->gcInfo.gcVarPtrSetCur, fldVarIndex); } +#ifndef TARGET_WASM #ifdef DEBUG if (compiler->verbose) { @@ -129,6 +134,7 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns printf("\n"); } #endif // DEBUG +#endif // !TARGET_WASM } #ifdef USING_VARIABLE_LIVE_RANGE @@ -141,8 +147,10 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns compiler->codeGen->siUpdate(); #endif // USING_SCOPE_INFO } +#endif // !TARGET_WASM } +#ifndef TARGET_WASM if (ForCodeGen && spill) { if (VarSetOps::IsMember(compiler, compiler->codeGen->gcInfo.gcTrkStkPtrLcls, fldVarIndex)) @@ -160,6 +168,7 @@ bool TreeLifeUpdater::UpdateLifeFieldVar(GenTreeLclVar* lclNode, uns } return true; } +#endif // !TARGET_WASM return false; } @@ -252,6 +261,7 @@ void TreeLifeUpdater::UpdateLifeVar(GenTree* tree) if (varDsc->lvTracked) { VarSetOps::AddElemD(compiler, varDeltaSet, varDsc->lvVarIndex); +#ifndef TARGET_WASM if (ForCodeGen) { if (isBorn && varDsc->lvIsRegCandidate() && tree->gtHasReg()) @@ -269,7 +279,9 @@ void TreeLifeUpdater::UpdateLifeVar(GenTree* tree) VarSetOps::AddElemD(compiler, stackVarDeltaSet, varDsc->lvVarIndex); } } +#endif // !TARGET_WASM } +#ifndef TARGET_WASM else if (ForCodeGen && lclVarTree->IsMultiRegLclVar()) { assert(varDsc->lvPromoted && compiler->lvaEnregMultiRegVars); @@ -306,6 +318,7 @@ void TreeLifeUpdater::UpdateLifeVar(GenTree* tree) } spill = false; } +#endif // !TARGET_WASM else if (varDsc->lvPromoted) { // If hasDeadTrackedFieldVars is true, then, for a LDOBJ(ADDR()), @@ -392,6 +405,7 @@ void TreeLifeUpdater::UpdateLifeVar(GenTree* tree) VarSetOps::Assign(compiler, compiler->compCurLife, newLife); +#ifndef TARGET_WASM if (ForCodeGen) { // Only add vars to the gcInfo.gcVarPtrSetCur if they are currently on stack, since the @@ -437,8 +451,10 @@ void TreeLifeUpdater::UpdateLifeVar(GenTree* tree) compiler->codeGen->siUpdate(); #endif // USING_SCOPE_INFO } +#endif // !TARGET_WASM } +#ifndef TARGET_WASM if (ForCodeGen && spill) { assert(!varDsc->lvPromoted); @@ -457,6 +473,7 @@ void TreeLifeUpdater::UpdateLifeVar(GenTree* tree) } } } +#endif // !TARGET_WASM } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/unwind.cpp b/src/coreclr/jit/unwind.cpp index c8db0dc94498..f8ace4acfd40 100644 --- a/src/coreclr/jit/unwind.cpp +++ b/src/coreclr/jit/unwind.cpp @@ -15,6 +15,7 @@ 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#pragma hdrstop
#endif
+#ifndef TARGET_WASM
#if defined(FEATURE_EH_FUNCLETS)
//------------------------------------------------------------------------
@@ -116,6 +117,7 @@ void Compiler::unwindGetFuncLocations(FuncInfoDsc* func,
}
#endif // FEATURE_EH_FUNCLETS
+#endif // !TARGET_WASM
#if defined(TARGET_UNIX)
@@ -378,6 +380,7 @@ void Compiler::DumpCfiInfo(bool isHotCode,
#endif // TARGET_UNIX
+#ifndef TARGET_WASM
//------------------------------------------------------------------------
// Compiler::unwindGetCurrentOffset: Calculate the current byte offset of the
// prolog being generated.
@@ -408,6 +411,7 @@ UNATIVE_OFFSET Compiler::unwindGetCurrentOffset(FuncInfoDsc* func)
return offset;
}
+#endif // !TARGET_WASM
#if defined(TARGET_AMD64)
@@ -425,7 +429,7 @@ UNATIVE_OFFSET Compiler::unwindGetCurrentOffset(FuncInfoDsc* func)
// See unwindX86.cpp
-#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO
+#elif defined(TARGET_WASM) // TODO
#else // TARGET*
From 8139539163dc6435aa72e22ba0561f64db6fa96a Mon Sep 17 00:00:00 2001
From: yowl
Date: Sun, 7 Feb 2021 21:28:56 -0500
Subject: [PATCH 08/44] Remove lowering/emit/lsra/codegen by #ifdefs
Some code asserts false where not sure what to do. Passes non-wasm tests locally.
---
 src/coreclr/inc/cordebuginfo.h | 2 +-
 src/coreclr/inc/corinfoinstructionset.h | 36 -
 src/coreclr/inc/switches.h | 2 +-
 src/coreclr/jit/importer.cpp | 2 +-
 src/coreclr/jit/instr.h | 4 +-
 src/coreclr/jit/instrs.h | 2 +-
 src/coreclr/jit/jit.h | 2 +-
 src/coreclr/jit/lowerwasm.cpp | 4907 -----------------------
 src/coreclr/jit/lsrawasm.cpp | 1528 -------
 src/coreclr/jit/register.h | 6 +-
 src/coreclr/jit/simd.h | 2 +-
 src/coreclr/jit/target.h | 14 +-
 src/coreclr/jit/targetwasm.cpp | 2 +-
 src/coreclr/jit/unwindwasm.cpp | 4 +-
 src/coreclr/jit/utils.cpp | 2 +-
 src/coreclr/jit/valuenum.cpp | 4 +-
 src/coreclr/jit/valuenumfuncs.h | 2 +-
 17 files changed, 25 insertions(+), 6496 deletions(-)
 delete mode 100644 src/coreclr/jit/lowerwasm.cpp
 delete mode 100644 src/coreclr/jit/lsrawasm.cpp
diff --git a/src/coreclr/inc/cordebuginfo.h b/src/coreclr/inc/cordebuginfo.h
index 466c972dc7b6..e70ee34f28e0 100644
--- a/src/coreclr/inc/cordebuginfo.h
+++ b/src/coreclr/inc/cordebuginfo.h
@@ -145,7 +145,7 @@ class ICorDebugInfo
REGNUM_R13,
REGNUM_R14,
REGNUM_R15,
-#elif TARGET_WASM32 || TARGET_WASM64
+#elif TARGET_WASM // TODO????
#else PORTABILITY_WARNING("Register numbers not defined on this platform") diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h index f1e654ed4a27..dcb058593920 100644 --- a/src/coreclr/inc/corinfoinstructionset.h +++ b/src/coreclr/inc/corinfoinstructionset.h @@ -108,42 +108,6 @@ enum CORINFO_InstructionSet InstructionSet_PCLMULQDQ_X64=33, InstructionSet_POPCNT_X64=34, #endif // TARGET_X86 -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) - //InstructionSet_X86Base = 1, - //InstructionSet_SSE = 2, - //InstructionSet_SSE2 = 3, - //InstructionSet_SSE3 = 4, - //InstructionSet_SSSE3 = 5, - //InstructionSet_SSE41 = 6, - InstructionSet_SSE42 = 7, - //InstructionSet_AVX = 8, - InstructionSet_AVX2 = 9, - //InstructionSet_AES = 10, - //InstructionSet_BMI1 = 11, - //InstructionSet_BMI2 = 12, - //InstructionSet_FMA = 13, - //InstructionSet_LZCNT = 14, - //InstructionSet_PCLMULQDQ = 15, - //InstructionSet_POPCNT = 16, - //InstructionSet_Vector128 = 17, - //InstructionSet_Vector256 = 18, - //InstructionSet_X86Base_X64 = 19, - //InstructionSet_SSE_X64 = 20, - //InstructionSet_SSE2_X64 = 21, - //InstructionSet_SSE3_X64 = 22, - //InstructionSet_SSSE3_X64 = 23, - //InstructionSet_SSE41_X64 = 24, - //InstructionSet_SSE42_X64 = 25, - //InstructionSet_AVX_X64 = 26, - //InstructionSet_AVX2_X64 = 27, - //InstructionSet_AES_X64 = 28, - //InstructionSet_BMI1_X64 = 29, - //InstructionSet_BMI2_X64 = 30, - //InstructionSet_FMA_X64 = 31, - //InstructionSet_LZCNT_X64 = 32, - //InstructionSet_PCLMULQDQ_X64 = 33, - //InstructionSet_POPCNT_X64 = 34, -#endif // TARGET_AMD64 }; struct CORINFO_InstructionSetFlags diff --git a/src/coreclr/inc/switches.h b/src/coreclr/inc/switches.h index 7a699abfd3c6..51a55c0a839a 100644 --- a/src/coreclr/inc/switches.h +++ b/src/coreclr/inc/switches.h @@ -65,7 +65,7 @@ #define USE_UPPER_ADDRESS 0 #endif // !HOST_UNIX -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #define USE_UPPER_ADDRESS 0 // TODO : what's this? #else #error Please add a new #elif clause and define all portability macros for the new platform diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index eec01b573e51..c138d92d330a 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -3783,7 +3783,7 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, GenTree* op1; GenTree* op2; -#if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_WASM) // TODO Wasm // TODO-ARM-CQ: reenable treating Interlocked operation as intrinsic // Note that CORINFO_INTRINSIC_InterlockedAdd32/64 are not actually used. diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index dac28eef3436..862616b28860 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -47,7 +47,7 @@ enum instruction : unsigned INS_lea, // Not a real instruction. It is used for load the address of stack locals -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #define INST0(id, nm, um, mr, flags) INS_##id, #define INST1(id, nm, um, mr, flags) INS_##id, #define INST2(id, nm, um, mr, mi, flags) INS_##id, @@ -114,7 +114,7 @@ enum insFlags: unsigned INS_FLAGS_SET = 0x01, INS_FLAGS_DONT_CARE = 0x02, }; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO : can this be removed/empty? +#elif defined(TARGET_WASM) // TODO : can this be removed/empty? 
enum insFlags: uint8_t { INS_FLAGS_None = 0x00, diff --git a/src/coreclr/jit/instrs.h b/src/coreclr/jit/instrs.h index 790984e94f85..4e9b1b1f648f 100644 --- a/src/coreclr/jit/instrs.h +++ b/src/coreclr/jit/instrs.h @@ -7,7 +7,7 @@ #include "instrsarm.h" #elif defined(TARGET_ARM64) #include "instrsarm64.h" -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #include "instrswasm.h" #else #error Unsupported or unset target architecture diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index e4d4ac9360aa..c45472b1ee19 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -195,7 +195,7 @@ #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_ARMNT #elif defined(TARGET_ARM64) #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_ARM64 // 0xAA64 -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_AMD64 // TODO: what is this? #else #error Unsupported or unset target architecture diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp deleted file mode 100644 index 4efd683c830c..000000000000 --- a/src/coreclr/jit/lowerwasm.cpp +++ /dev/null @@ -1,4907 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XX XX -XX Lowering for AMD64, x86 XX -XX XX -XX This encapsulates all the logic for lowering trees for the AMD64 XX -XX architecture. For a more detailed view of what is lowering, please XX -XX take a look at Lower.cpp XX -XX XX -XX XX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -*/ - -#include "jitpch.h" -#ifdef _MSC_VER -#pragma hdrstop -#endif - -#if defined (TARGET_WASM32) || defined(TARGET_WASM64) // This file is only used for wasm - -#include "jit.h" -#include "sideeffects.h" -#include "lower.h" - -// xarch supports both ROL and ROR instructions so no lowering is required. -void Lowering::LowerRotate(GenTree* tree) -{ - ContainCheckShiftRotate(tree->AsOp()); -} - -//------------------------------------------------------------------------ -// LowerStoreLoc: Lower a store of a lclVar -// -// Arguments: -// storeLoc - the local store (GT_STORE_LCL_FLD or GT_STORE_LCL_VAR) -// -// Notes: -// This involves: -// - Handling of contained immediates. -// - Widening operations of unsigneds. - -void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) -{ - // Try to widen the ops if they are going into a local var. 
- if ((storeLoc->gtOper == GT_STORE_LCL_VAR) && (storeLoc->gtOp1->gtOper == GT_CNS_INT)) - { - GenTreeIntCon* con = storeLoc->gtOp1->AsIntCon(); - ssize_t ival = con->gtIconVal; - - unsigned varNum = storeLoc->GetLclNum(); - LclVarDsc* varDsc = comp->lvaTable + varNum; - - if (varDsc->lvIsSIMDType()) - { - noway_assert(storeLoc->gtType != TYP_STRUCT); - } - unsigned size = genTypeSize(storeLoc); - // If we are storing a constant into a local variable - // we extend the size of the store here - if ((size < 4) && !varTypeIsStruct(varDsc)) - { - if (!varTypeIsUnsigned(varDsc)) - { - if (genTypeSize(storeLoc) == 1) - { - if ((ival & 0x7f) != ival) - { - ival = ival | 0xffffff00; - } - } - else - { - assert(genTypeSize(storeLoc) == 2); - if ((ival & 0x7fff) != ival) - { - ival = ival | 0xffff0000; - } - } - } - - // A local stack slot is at least 4 bytes in size, regardless of - // what the local var is typed as, so auto-promote it here - // unless it is a field of a promoted struct - // TODO-XArch-CQ: if the field is promoted shouldn't we also be able to do this? - if (!varDsc->lvIsStructField) - { - storeLoc->gtType = TYP_INT; - con->SetIconValue(ival); - } - } - } - if (storeLoc->OperIs(GT_STORE_LCL_FLD)) - { - // We should only encounter this for lclVars that are lvDoNotEnregister. - verifyLclFldDoNotEnregister(storeLoc->GetLclNum()); - } - ContainCheckStoreLoc(storeLoc); -} - -//------------------------------------------------------------------------ -// LowerStoreIndir: Determine addressing mode for an indirection, and whether operands are contained. -// -// Arguments: -// node - The indirect store node (GT_STORE_IND) of interest -// -// Return Value: -// None. -// -void Lowering::LowerStoreIndir(GenTreeIndir* node) -{ - // Mark all GT_STOREIND nodes to indicate that it is not known - // whether it represents a RMW memory op. - node->AsStoreInd()->SetRMWStatusDefault(); - - if (!varTypeIsFloating(node)) - { - // Perform recognition of trees with the following structure: - // StoreInd(addr, BinOp(expr, GT_IND(addr))) - // to be able to fold this into an instruction of the form - // BINOP [addr], register - // where register is the actual place where 'expr' is computed. - // - // SSE2 doesn't support RMW form of instructions. - if (LowerRMWMemOp(node)) - { - return; - } - } - ContainCheckStoreIndir(node); -} - -//------------------------------------------------------------------------ -// LowerBlockStore: Lower a block store node -// -// Arguments: -// blkNode - The block store node to lower -// -void Lowering::LowerBlockStore(GenTreeBlk* blkNode) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// ContainBlockStoreAddress: Attempt to contain an address used by an unrolled block store. -// -// Arguments: -// blkNode - the block store node -// size - the block size -// addr - the address node to try to contain -// -void Lowering::ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenTree* addr) -{ - assert(blkNode->OperIs(GT_STORE_BLK) && (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)); - assert(size < INT32_MAX); - - if (addr->OperIsLocalAddr()) - { - addr->SetContained(); - return; - } - - if (!addr->OperIsAddrMode() && !TryCreateAddrMode(addr, true)) - { - return; - } - - GenTreeAddrMode* addrMode = addr->AsAddrMode(); - - // On x64 the address mode displacement is signed so it must not exceed INT32_MAX. 
This check is - // an approximation since the last displacement we generate in an unrolled block operation can be - // up to 16 bytes lower than offset + size. But offsets large enough to hit this case are likely - // to be extremely rare for this to ever be a CQ issue. - // On x86 this shouldn't be needed but then again, offsets large enough to hit this are rare. - if (addrMode->Offset() > (INT32_MAX - static_cast(size))) - { - return; - } - - // Note that the parentNode is always the block node, even if we're dealing with the source address. - // The source address is not directly used by the block node but by an IND node and that IND node is - // always contained. - if (!IsSafeToContainMem(blkNode, addrMode)) - { - return; - } - - addrMode->SetContained(); -} - -//------------------------------------------------------------------------ -// LowerPutArgStk: Lower a GT_PUTARG_STK. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// None. -// -void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) -{ - GenTree* src = putArgStk->gtGetOp1(); - - if (src->OperIs(GT_FIELD_LIST)) - { -#ifdef TARGET_X86 - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Invalid; - - GenTreeFieldList* fieldList = src->AsFieldList(); - - // The code generator will push these fields in reverse order by offset. Reorder the list here s.t. the order - // of uses is visible to LSRA. - assert(fieldList->Uses().IsSorted()); - fieldList->Uses().Reverse(); - - // Now that the fields have been sorted, the kind of code we will generate. - bool allFieldsAreSlots = true; - unsigned prevOffset = putArgStk->GetStackByteSize(); - for (GenTreeFieldList::Use& use : fieldList->Uses()) - { - GenTree* const fieldNode = use.GetNode(); - const var_types fieldType = fieldNode->TypeGet(); - const unsigned fieldOffset = use.GetOffset(); - assert(fieldType != TYP_LONG); - - // We can treat as a slot any field that is stored at a slot boundary, where the previous - // field is not in the same slot. (Note that we store the fields in reverse order.) - const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4); - if (!fieldIsSlot) - { - allFieldsAreSlots = false; - } - - // For x86 we must mark all integral fields as contained or reg-optional, and handle them - // accordingly in code generation, since we may have up to 8 fields, which cannot all be in - // registers to be consumed atomically by the call. - if (varTypeIsIntegralOrI(fieldNode)) - { - if (fieldNode->OperGet() == GT_LCL_VAR) - { - LclVarDsc* varDsc = &(comp->lvaTable[fieldNode->AsLclVarCommon()->GetLclNum()]); - if (!varDsc->lvDoNotEnregister) - { - fieldNode->SetRegOptional(); - } - else - { - MakeSrcContained(putArgStk, fieldNode); - } - } - else if (fieldNode->IsIntCnsFitsInI32()) - { - MakeSrcContained(putArgStk, fieldNode); - } - else - { - // For the case where we cannot directly push the value, if we run out of registers, - // it would be better to defer computation until we are pushing the arguments rather - // than spilling, but this situation is not all that common, as most cases of promoted - // structs do not have a large number of fields, and of those most are lclVars or - // copy-propagated constants. - fieldNode->SetRegOptional(); - } - } - - prevOffset = fieldOffset; - } - - // Set the copy kind. - // TODO-X86-CQ: Even if we are using push, if there are contiguous floating point fields, we should - // adjust the stack once for those fields. 
The latter is really best done in code generation, but - // this tuning should probably be undertaken as a whole. - // Also, if there are floating point fields, it may be better to use the "Unroll" mode - // of copying the struct as a whole, if the fields are not register candidates. - if (allFieldsAreSlots) - { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PushAllSlots; - } - else - { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; - } -#endif // TARGET_X86 - return; - } - -#ifdef FEATURE_PUT_STRUCT_ARG_STK - if (src->TypeGet() != TYP_STRUCT) -#endif // FEATURE_PUT_STRUCT_ARG_STK - { - // If the child of GT_PUTARG_STK is a constant, we don't need a register to - // move it to memory (stack location). - // - // On AMD64, we don't want to make 0 contained, because we can generate smaller code - // by zeroing a register and then storing it. E.g.: - // xor rdx, rdx - // mov gword ptr [rsp+28H], rdx - // is 2 bytes smaller than: - // mov gword ptr [rsp+28H], 0 - // - // On x86, we push stack arguments; we don't use 'mov'. So: - // push 0 - // is 1 byte smaller than: - // xor rdx, rdx - // push rdx - - if (IsContainableImmed(putArgStk, src) -#if defined(TARGET_AMD64) - && !src->IsIntegralConst(0) -#endif // TARGET_AMD64 - ) - { - MakeSrcContained(putArgStk, src); - } - return; - } - -#ifdef FEATURE_PUT_STRUCT_ARG_STK - GenTree* srcAddr = nullptr; - - bool haveLocalAddr = false; - if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND)) - { - srcAddr = src->AsOp()->gtOp1; - assert(srcAddr != nullptr); - haveLocalAddr = srcAddr->OperIsLocalAddr(); - } - else - { - assert(varTypeIsSIMD(putArgStk)); - } - - ClassLayout* layout = src->AsObj()->GetLayout(); - - // In case of a CpBlk we could use a helper call. In case of putarg_stk we - // can't do that since the helper call could kill some already set up outgoing args. - // TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj. - // The cpyXXXX code is rather complex and this could cause it to be more complex, but - // it might be the right thing to do. - - unsigned size = putArgStk->GetStackByteSize(); - - // TODO-X86-CQ: The helper call either is not supported on x86 or required more work - // (I don't know which). - - if (size <= CPBLK_UNROLL_LIMIT && !layout->HasGCPtr()) - { -#ifdef TARGET_X86 - if (size < XMM_REGSIZE_BYTES) - { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; - } - else -#endif // TARGET_X86 - { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll; - } - } -#ifdef TARGET_X86 - else if (layout->HasGCPtr()) - { - // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update - // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions. - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push; - } -#endif // TARGET_X86 - else - { - putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr; - } - // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree. - MakeSrcContained(putArgStk, src); - if (haveLocalAddr) - { - // If the source address is the address of a lclVar, make the source address contained to avoid unnecessary - // copies. - // - MakeSrcContained(putArgStk, srcAddr); - } -#endif // FEATURE_PUT_STRUCT_ARG_STK -} - -/* Lower GT_CAST(srcType, DstType) nodes. 
- * - * Casts from small int type to float/double are transformed as follows: - * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double) - * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double) - * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double) - * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double) - * - * SSE2 conversion instructions operate on signed integers. casts from Uint32/Uint64 - * are morphed as follows by front-end and hence should not be seen here. - * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double) - * GT_CAST(uint64, float) = GT_CAST(GT_CAST(uint64, double), float) - * - * - * Similarly casts from float/double to a smaller int type are transformed as follows: - * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) - * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) - * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) - * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) - * - * SSE2 has instructions to convert a float/double vlaue into a signed 32/64-bit - * integer. The above transformations help us to leverage those instructions. - * - * Note that for the following conversions we still depend on helper calls and - * don't expect to see them here. - * i) GT_CAST(float/double, uint64) - * ii) GT_CAST(float/double, int type with overflow detection) - * - * TODO-XArch-CQ: (Low-pri): Jit64 generates in-line code of 8 instructions for (i) above. - * There are hardly any occurrences of this conversion operation in platform - * assemblies or in CQ perf benchmarks (1 occurrence in corelib, microsoft.jscript, - * 1 occurrence in Roslyn and no occurrences in system, system.core, system.numerics - * system.windows.forms, scimark, fractals, bio mums). If we ever find evidence that - * doing this optimization is a win, should consider generating in-lined code. - */ -void Lowering::LowerCast(GenTree* tree) -{ - assert(tree->OperGet() == GT_CAST); - - GenTree* castOp = tree->AsCast()->CastOp(); - var_types castToType = tree->CastToType(); - var_types srcType = castOp->TypeGet(); - var_types tmpType = TYP_UNDEF; - - // force the srcType to unsigned if GT_UNSIGNED flag is set - if (tree->gtFlags & GTF_UNSIGNED) - { - srcType = genUnsignedType(srcType); - } - - // We should never see the following casts as they are expected to be lowered - // apropriately or converted into helper calls by front-end. - // srcType = float/double castToType = * and overflow detecting cast - // Reason: must be converted to a helper call - // srcType = float/double, castToType = ulong - // Reason: must be converted to a helper call - // srcType = uint castToType = float/double - // Reason: uint -> float/double = uint -> long -> float/double - // srcType = ulong castToType = float - // Reason: ulong -> float = ulong -> double -> float - if (varTypeIsFloating(srcType)) - { - noway_assert(!tree->gtOverflow()); - noway_assert(castToType != TYP_ULONG); - } - else if (srcType == TYP_UINT) - { - noway_assert(!varTypeIsFloating(castToType)); - } - else if (srcType == TYP_ULONG) - { - noway_assert(castToType != TYP_FLOAT); - } - - // Case of src is a small type and dst is a floating point type. - if (varTypeIsSmall(srcType) && varTypeIsFloating(castToType)) - { - // These conversions can never be overflow detecting ones. 
- noway_assert(!tree->gtOverflow()); - tmpType = TYP_INT; - } - // case of src is a floating point type and dst is a small type. - else if (varTypeIsFloating(srcType) && varTypeIsSmall(castToType)) - { - tmpType = TYP_INT; - } - - if (tmpType != TYP_UNDEF) - { - GenTree* tmp = comp->gtNewCastNode(tmpType, castOp, tree->IsUnsigned(), tmpType); - tmp->gtFlags |= (tree->gtFlags & (GTF_OVERFLOW | GTF_EXCEPT)); - - tree->gtFlags &= ~GTF_UNSIGNED; - tree->AsOp()->gtOp1 = tmp; - BlockRange().InsertAfter(castOp, tmp); - ContainCheckCast(tmp->AsCast()); - } - - // Now determine if we have operands that should be contained. - ContainCheckCast(tree->AsCast()); -} - -#ifdef FEATURE_SIMD -//---------------------------------------------------------------------------------------------- -// Lowering::LowerSIMD: Perform containment analysis for a SIMD intrinsic node. -// -// Arguments: -// simdNode - The SIMD intrinsic node. -// -void Lowering::LowerSIMD(GenTreeSIMD* simdNode) -{ - if (simdNode->TypeGet() == TYP_SIMD12) - { - // GT_SIMD node requiring to produce TYP_SIMD12 in fact - // produces a TYP_SIMD16 result - simdNode->gtType = TYP_SIMD16; - } - - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicInitN) - { - assert(simdNode->gtSIMDBaseType == TYP_FLOAT); - - int argCount = 0; - int constArgCount = 0; - float constArgValues[4]{0, 0, 0, 0}; - - for (GenTreeArgList* list = simdNode->gtGetOp1()->AsArgList(); list != nullptr; list = list->Rest()) - { - GenTree* arg = list->Current(); - - assert(arg->TypeGet() == simdNode->gtSIMDBaseType); - assert(argCount < (int)_countof(constArgValues)); - - if (arg->IsCnsFltOrDbl()) - { - constArgValues[constArgCount] = static_cast(arg->AsDblCon()->gtDconVal); - constArgCount++; - } - - argCount++; - } - - if (constArgCount == argCount) - { - for (GenTreeArgList* list = simdNode->gtGetOp1()->AsArgList(); list != nullptr; list = list->Rest()) - { - BlockRange().Remove(list->Current()); - } - - assert(sizeof(constArgValues) == 16); - - unsigned cnsSize = sizeof(constArgValues); - unsigned cnsAlign = (comp->compCodeOpt() != Compiler::SMALL_CODE) ? cnsSize : 1; - - CORINFO_FIELD_HANDLE hnd = - comp->GetEmitter()->emitBlkConst(constArgValues, cnsSize, cnsAlign, simdNode->gtSIMDBaseType); - GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr); - BlockRange().InsertBefore(simdNode, clsVarAddr); - simdNode->ChangeOper(GT_IND); - simdNode->gtOp1 = clsVarAddr; - ContainCheckIndir(simdNode->AsIndir()); - - return; - } - } - -#ifdef TARGET_XARCH - if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND)) - { - // If SIMD vector is already in memory, we force its - // addr to be evaluated into a reg. This would allow - // us to generate [regBase] or [regBase+offset] or - // [regBase+sizeOf(SIMD vector baseType)*regIndex] - // to access the required SIMD vector element directly - // from memory. - // - // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we - // might be able update GT_LEA to fold the regIndex - // or offset in some cases. Instead with this - // approach we always evaluate GT_LEA into a reg. - // Ideally, we should be able to lower GetItem intrinsic - // into GT_IND(newAddr) where newAddr combines - // the addr of SIMD vector with the given index. 
- simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; - } -#endif - ContainCheckSIMD(simdNode); -} -#endif // FEATURE_SIMD - -#ifdef FEATURE_HW_INTRINSICS - -//---------------------------------------------------------------------------------------------- -// LowerHWIntrinsicCC: Lowers a hardware intrinsic node that produces a boolean value by -// setting the condition flags. -// -// Arguments: -// node - The hardware intrinsic node -// newIntrinsicId - The intrinsic id of the lowered intrinsic node -// condition - The condition code of the generated SETCC/JCC node -// -void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition) -{ - GenTreeCC* cc = LowerNodeCC(node, condition); - - node->gtHWIntrinsicId = newIntrinsicId; - node->gtType = TYP_VOID; - node->ClearUnusedValue(); - - bool swapOperands = false; - bool canSwapOperands = false; - - switch (newIntrinsicId) - { - case NI_SSE_COMISS: - case NI_SSE_UCOMISS: - case NI_SSE2_COMISD: - case NI_SSE2_UCOMISD: - // In some cases we can generate better code if we swap the operands: - // - If the condition is not one of the "preferred" floating point conditions we can swap - // the operands and change the condition to avoid generating an extra JP/JNP branch. - // - If the first operand can be contained but the second cannot, we can swap operands in - // order to be able to contain the first operand and avoid the need for a temp reg. - // We can't handle both situations at the same time and since an extra branch is likely to - // be worse than an extra temp reg (x64 has a reasonable number of XMM registers) we'll favor - // the branch case: - // - If the condition is not preferred then swap, even if doing this will later prevent - // containment. - // - Allow swapping for containment purposes only if this doesn't result in a non-"preferred" - // condition being generated. - if ((cc != nullptr) && cc->gtCondition.PreferSwap()) - { - swapOperands = true; - } - else - { - canSwapOperands = (cc == nullptr) || !GenCondition::Swap(cc->gtCondition).PreferSwap(); - } - break; - - case NI_SSE41_PTEST: - case NI_AVX_PTEST: - // If we need the Carry flag then we can't swap operands. - canSwapOperands = (cc == nullptr) || cc->gtCondition.Is(GenCondition::EQ, GenCondition::NE); - break; - - default: - unreached(); - } - - if (canSwapOperands) - { - bool op1SupportsRegOptional = false; - bool op2SupportsRegOptional = false; - - if (!IsContainableHWIntrinsicOp(node, node->gtGetOp2(), &op2SupportsRegOptional) && - IsContainableHWIntrinsicOp(node, node->gtGetOp1(), &op1SupportsRegOptional)) - { - // Swap operands if op2 cannot be contained but op1 can. - swapOperands = true; - } - } - - if (swapOperands) - { - std::swap(node->gtOp1, node->gtOp2); - - if (cc != nullptr) - { - cc->gtCondition = GenCondition::Swap(cc->gtCondition); - } - } -} - -//---------------------------------------------------------------------------------------------- -// LowerFusedMultiplyAdd: Changes NI_FMA_MultiplyAddScalar produced by Math(F).FusedMultiplyAdd -// to a better FMA intrinsics if there are GT_NEG around in order to eliminate them. 
-// -// Arguments: -// node - The hardware intrinsic node -// -// Notes: -// Math(F).FusedMultiplyAdd is expanded into NI_FMA_MultiplyAddScalar and -// depending on additional GT_NEG nodes around it can be: -// -// x * y + z -> NI_FMA_MultiplyAddScalar -// x * -y + z -> NI_FMA_MultiplyAddNegatedScalar -// -x * y + z -> NI_FMA_MultiplyAddNegatedScalar -// -x * -y + z -> NI_FMA_MultiplyAddScalar -// x * y - z -> NI_FMA_MultiplySubtractScalar -// x * -y - z -> NI_FMA_MultiplySubtractNegatedScalar -// -x * y - z -> NI_FMA_MultiplySubtractNegatedScalar -// -x * -y - z -> NI_FMA_MultiplySubtractScalar -// -void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node) -{ - assert(node->gtHWIntrinsicId == NI_FMA_MultiplyAddScalar); - GenTreeArgList* argList = node->gtGetOp1()->AsArgList(); - GenTreeHWIntrinsic* createScalarOps[3]; - - for (GenTreeHWIntrinsic*& createScalarOp : createScalarOps) - { - GenTree*& current = argList->Current(); - assert(current != nullptr); - if (!current->OperIsHWIntrinsic()) - { - return; // Math(F).FusedMultiplyAdd is expected to emit three NI_Vector128_CreateScalarUnsafe - // but it's also possible to use NI_FMA_MultiplyAddScalar directly with any operands - } - GenTreeHWIntrinsic* hwArg = current->AsHWIntrinsic(); - if (hwArg->gtHWIntrinsicId != NI_Vector128_CreateScalarUnsafe) - { - return; - } - createScalarOp = hwArg; - argList = argList->Rest(); - } - assert(argList == nullptr); - - GenTree* argX = createScalarOps[0]->gtGetOp1(); - GenTree* argY = createScalarOps[1]->gtGetOp1(); - GenTree* argZ = createScalarOps[2]->gtGetOp1(); - - const bool negMul = argX->OperIs(GT_NEG) != argY->OperIs(GT_NEG); - if (argX->OperIs(GT_NEG)) - { - createScalarOps[0]->gtOp1 = argX->gtGetOp1(); - BlockRange().Remove(argX); - } - if (argY->OperIs(GT_NEG)) - { - createScalarOps[1]->gtOp1 = argY->gtGetOp1(); - BlockRange().Remove(argY); - } - if (argZ->OperIs(GT_NEG)) - { - createScalarOps[2]->gtOp1 = argZ->gtGetOp1(); - BlockRange().Remove(argZ); - node->gtHWIntrinsicId = negMul ? NI_FMA_MultiplySubtractNegatedScalar : NI_FMA_MultiplySubtractScalar; - } - else - { - node->gtHWIntrinsicId = negMul ? NI_FMA_MultiplyAddNegatedScalar : NI_FMA_MultiplyAddScalar; - } -} - -//---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node. -// -// Arguments: -// node - The hardware intrinsic node. -// -void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) -{ - if (node->TypeGet() == TYP_SIMD12) - { - // GT_HWINTRINSIC node requiring to produce TYP_SIMD12 in fact - // produces a TYP_SIMD16 result - node->gtType = TYP_SIMD16; - } - - NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; - - switch (intrinsicId) - { - case NI_Vector128_Create: - case NI_Vector256_Create: - { - // We don't directly support the Vector128.Create or Vector256.Create methods in codegen - // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect - // that the node is modified to either not be a HWIntrinsic node or that it is no longer - // the same intrinsic as when it came in. 
In the case of Vector256.Create, we may lower - // it into 2x Vector128.Create intrinsics which themselves are also lowered into other - // intrinsics that are not Vector*.Create - - LowerHWIntrinsicCreate(node); - assert(!node->OperIsHWIntrinsic() || (node->gtHWIntrinsicId != intrinsicId)); - LowerNode(node); - return; - } - - case NI_Vector128_Dot: - case NI_Vector256_Dot: - { - LowerHWIntrinsicDot(node); - return; - } - - case NI_Vector128_op_Equality: - case NI_Vector256_op_Equality: - { - LowerHWIntrinsicCmpOp(node, GT_EQ); - return; - } - - case NI_Vector128_op_Inequality: - case NI_Vector256_op_Inequality: - { - LowerHWIntrinsicCmpOp(node, GT_NE); - return; - } - - case NI_Vector128_ToScalar: - case NI_Vector256_ToScalar: - { - LowerHWIntrinsicToScalar(node); - break; - } - - case NI_SSE2_Insert: - case NI_SSE41_Insert: - case NI_SSE41_X64_Insert: - { - assert(HWIntrinsicInfo::lookupNumArgs(node) == 3); - - GenTreeArgList* argList = node->gtOp1->AsArgList(); - - // Insert takes either a 32-bit register or a memory operand. - // In either case, only gtSIMDBaseType bits are read and so - // widening or narrowing the operand may be unnecessary and it - // can just be used directly. - - argList->Rest()->gtOp1 = TryRemoveCastIfPresent(node->gtSIMDBaseType, argList->Rest()->gtOp1); - break; - } - - case NI_SSE42_Crc32: - { - assert(HWIntrinsicInfo::lookupNumArgs(node) == 2); - - // Crc32 takes either a bit register or a memory operand. - // In either case, only gtType bits are read and so widening - // or narrowing the operand may be unnecessary and it can - // just be used directly. - - node->gtOp2 = TryRemoveCastIfPresent(node->gtType, node->gtOp2); - break; - } - - case NI_SSE2_CompareGreaterThan: - { - if (node->gtSIMDBaseType != TYP_DOUBLE) - { - assert(varTypeIsIntegral(node->gtSIMDBaseType)); - break; - } - - FALLTHROUGH; - } - - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE2_CompareGreaterThanOrEqual: - case NI_SSE2_CompareNotGreaterThan: - case NI_SSE2_CompareNotGreaterThanOrEqual: - { - assert((node->gtSIMDBaseType == TYP_FLOAT) || (node->gtSIMDBaseType == TYP_DOUBLE)); - - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) - { - break; - } - - // pre-AVX doesn't actually support these intrinsics in hardware so we need to swap the operands around - std::swap(node->gtOp1, node->gtOp2); - break; - } - - case NI_SSE2_CompareLessThan: - case NI_SSE42_CompareLessThan: - case NI_AVX2_CompareLessThan: - { - if (node->gtSIMDBaseType == TYP_DOUBLE) - { - break; - } - assert(varTypeIsIntegral(node->gtSIMDBaseType)); - - // this isn't actually supported in hardware so we need to swap the operands around - std::swap(node->gtOp1, node->gtOp2); - break; - } - - case NI_SSE_CompareScalarOrderedEqual: - LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FEQ); - break; - case NI_SSE_CompareScalarOrderedNotEqual: - LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FNEU); - break; - case NI_SSE_CompareScalarOrderedLessThan: - LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FLT); - break; - case NI_SSE_CompareScalarOrderedLessThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FLE); - break; - case NI_SSE_CompareScalarOrderedGreaterThan: - LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FGT); - break; - case NI_SSE_CompareScalarOrderedGreaterThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE_COMISS, GenCondition::FGE); - break; - - case 
NI_SSE_CompareScalarUnorderedEqual: - LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FEQ); - break; - case NI_SSE_CompareScalarUnorderedNotEqual: - LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FNEU); - break; - case NI_SSE_CompareScalarUnorderedLessThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FLE); - break; - case NI_SSE_CompareScalarUnorderedLessThan: - LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FLT); - break; - case NI_SSE_CompareScalarUnorderedGreaterThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FGE); - break; - case NI_SSE_CompareScalarUnorderedGreaterThan: - LowerHWIntrinsicCC(node, NI_SSE_UCOMISS, GenCondition::FGT); - break; - - case NI_SSE2_CompareScalarOrderedEqual: - LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FEQ); - break; - case NI_SSE2_CompareScalarOrderedNotEqual: - LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FNEU); - break; - case NI_SSE2_CompareScalarOrderedLessThan: - LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FLT); - break; - case NI_SSE2_CompareScalarOrderedLessThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FLE); - break; - case NI_SSE2_CompareScalarOrderedGreaterThan: - LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FGT); - break; - case NI_SSE2_CompareScalarOrderedGreaterThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE2_COMISD, GenCondition::FGE); - break; - - case NI_SSE2_CompareScalarUnorderedEqual: - LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FEQ); - break; - case NI_SSE2_CompareScalarUnorderedNotEqual: - LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FNEU); - break; - case NI_SSE2_CompareScalarUnorderedLessThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FLE); - break; - case NI_SSE2_CompareScalarUnorderedLessThan: - LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FLT); - break; - case NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual: - LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FGE); - break; - case NI_SSE2_CompareScalarUnorderedGreaterThan: - LowerHWIntrinsicCC(node, NI_SSE2_UCOMISD, GenCondition::FGT); - break; - - case NI_SSE41_TestC: - LowerHWIntrinsicCC(node, NI_SSE41_PTEST, GenCondition::C); - break; - case NI_SSE41_TestZ: - LowerHWIntrinsicCC(node, NI_SSE41_PTEST, GenCondition::EQ); - break; - case NI_SSE41_TestNotZAndNotC: - LowerHWIntrinsicCC(node, NI_SSE41_PTEST, GenCondition::UGT); - break; - - case NI_AVX_TestC: - LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::C); - break; - case NI_AVX_TestZ: - LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::EQ); - break; - case NI_AVX_TestNotZAndNotC: - LowerHWIntrinsicCC(node, NI_AVX_PTEST, GenCondition::UGT); - break; - - case NI_FMA_MultiplyAddScalar: - LowerFusedMultiplyAdd(node); - break; - - default: - break; - } - - ContainCheckHWIntrinsic(node); -} - -//---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector128 or Vector256 comparison intrinsic -// -// Arguments: -// node - The hardware intrinsic node. 
-// cmpOp - The comparison operation, currently must be GT_EQ or GT_NE -// -void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) -{ - NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; - var_types baseType = node->gtSIMDBaseType; - unsigned simdSize = node->gtSIMDSize; - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - - assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) || - (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality)); - - assert(varTypeIsSIMD(simdType)); - assert(varTypeIsArithmetic(baseType)); - assert(simdSize != 0); - assert(node->gtType == TYP_BOOL); - assert((cmpOp == GT_EQ) || (cmpOp == GT_NE)); - - // We have the following (with the appropriate simd size and where the intrinsic could be op_Inequality): - // /--* op2 simd - // /--* op1 simd - // node = * HWINTRINSIC simd T op_Equality - - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE; - - if (op2->IsIntegralConstVector(0) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // On SSE4.1 or higher we can optimize comparisons against zero to - // just use PTEST. We can't support it for floating-point, however, - // as it has both +0.0 and -0.0 where +0.0 == -0.0 - - node->gtOp1 = op1; - BlockRange().Remove(op2); - - LIR::Use op1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(op1Use); - op1 = node->gtOp1; - - op2 = comp->gtClone(op1); - BlockRange().InsertAfter(op1, op2); - node->gtOp2 = op2; - - if (simdSize == 32) - { - node->gtHWIntrinsicId = NI_AVX_TestZ; - LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd); - } - else - { - node->gtHWIntrinsicId = NI_SSE41_TestZ; - LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd); - } - - return; - } - - NamedIntrinsic cmpIntrinsic; - var_types cmpType; - NamedIntrinsic mskIntrinsic; - var_types mskType; - int mskConstant; - - switch (baseType) - { - case TYP_BYTE: - case TYP_UBYTE: - case TYP_SHORT: - case TYP_USHORT: - case TYP_INT: - case TYP_UINT: - { - cmpType = baseType; - mskType = TYP_UBYTE; - - if (simdSize == 32) - { - cmpIntrinsic = NI_AVX2_CompareEqual; - mskIntrinsic = NI_AVX2_MoveMask; - mskConstant = -1; - } - else - { - assert(simdSize == 16); - - cmpIntrinsic = NI_SSE2_CompareEqual; - mskIntrinsic = NI_SSE2_MoveMask; - mskConstant = 0xFFFF; - } - break; - } - - case TYP_LONG: - case TYP_ULONG: - { - mskType = TYP_UBYTE; - - if (simdSize == 32) - { - cmpIntrinsic = NI_AVX2_CompareEqual; - cmpType = baseType; - mskIntrinsic = NI_AVX2_MoveMask; - mskConstant = -1; - } - else - { - assert(simdSize == 16); - - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - cmpIntrinsic = NI_SSE41_CompareEqual; - cmpType = baseType; - } - else - { - cmpIntrinsic = NI_SSE2_CompareEqual; - cmpType = TYP_UINT; - } - - mskIntrinsic = NI_SSE2_MoveMask; - mskConstant = 0xFFFF; - } - break; - } - - case TYP_FLOAT: - { - cmpType = baseType; - mskType = baseType; - - if (simdSize == 32) - { - cmpIntrinsic = NI_AVX_CompareEqual; - mskIntrinsic = NI_AVX_MoveMask; - mskConstant = 0xFF; - } - else - { - cmpIntrinsic = NI_SSE_CompareEqual; - mskIntrinsic = NI_SSE_MoveMask; - - if (simdSize == 16) - { - mskConstant = 0xF; - } - else if (simdSize == 12) - { - mskConstant = 0x7; - } - else - { - assert(simdSize == 8); - mskConstant = 0x3; - } - } - break; - } - - case TYP_DOUBLE: - { - cmpType = baseType; - mskType = baseType; - - if 
(simdSize == 32) - { - cmpIntrinsic = NI_AVX_CompareEqual; - mskIntrinsic = NI_AVX_MoveMask; - mskConstant = 0xF; - } - else - { - assert(simdSize == 16); - - cmpIntrinsic = NI_SSE2_CompareEqual; - mskIntrinsic = NI_SSE2_MoveMask; - mskConstant = 0x3; - } - break; - } - - default: - { - unreached(); - } - } - - GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, cmpIntrinsic, cmpType, simdSize); - BlockRange().InsertBefore(node, cmp); - LowerNode(cmp); - - GenTree* msk = comp->gtNewSimdHWIntrinsicNode(TYP_INT, cmp, mskIntrinsic, mskType, simdSize); - BlockRange().InsertAfter(cmp, msk); - LowerNode(msk); - - GenTree* mskCns = comp->gtNewIconNode(mskConstant, TYP_INT); - BlockRange().InsertAfter(msk, mskCns); - - if ((baseType == TYP_FLOAT) && (simdSize < 16)) - { - // For TYP_SIMD8 and TYP_SIMD12 we need to clear the upper bits and can't assume their value - - GenTree* tmp = comp->gtNewOperNode(GT_AND, TYP_INT, msk, mskCns); - BlockRange().InsertAfter(mskCns, tmp); - LowerNode(tmp); - - msk = tmp; - - mskCns = comp->gtNewIconNode(mskConstant, TYP_INT); - BlockRange().InsertAfter(msk, mskCns); - } - - node->ChangeOper(cmpOp); - - node->gtType = TYP_INT; - node->gtOp1 = msk; - node->gtOp2 = mskCns; - - GenTree* cc = LowerNodeCC(node, cmpCnd); - - node->gtType = TYP_VOID; - node->ClearUnusedValue(); - - LowerNode(node); -} - -//---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 Create call -// -// Arguments: -// node - The hardware intrinsic node. -// -void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) -{ - NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; - var_types simdType = node->gtType; - var_types baseType = node->gtSIMDBaseType; - unsigned simdSize = node->gtSIMDSize; - VectorConstant vecCns = {}; - - if ((simdSize == 8) && (simdType == TYP_DOUBLE)) - { - // TODO-Cleanup: Struct retyping means we have the wrong type here. We need to - // manually fix it up so the simdType checks below are correct. - simdType = TYP_SIMD8; - } - - assert(varTypeIsSIMD(simdType)); - assert(varTypeIsArithmetic(baseType)); - assert(simdSize != 0); - - GenTreeArgList* argList = nullptr; - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - // Spare GenTrees to be used for the lowering logic below - // Defined upfront to avoid naming conflicts, etc... - GenTree* idx = nullptr; - GenTree* tmp1 = nullptr; - GenTree* tmp2 = nullptr; - GenTree* tmp3 = nullptr; - - assert(op1 != nullptr); - - unsigned argCnt = 0; - unsigned cnsArgCnt = 0; - - if (op1->OperIsList()) - { - assert(op2 == nullptr); - - for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) - { - if (HandleArgForHWIntrinsicCreate(argList->Current(), argCnt, vecCns, baseType)) - { - cnsArgCnt += 1; - } - argCnt += 1; - } - } - else - { - if (HandleArgForHWIntrinsicCreate(op1, argCnt, vecCns, baseType)) - { - cnsArgCnt += 1; - } - argCnt += 1; - - if (op2 != nullptr) - { - if (HandleArgForHWIntrinsicCreate(op2, argCnt, vecCns, baseType)) - { - cnsArgCnt += 1; - } - argCnt += 1; - } - else if (cnsArgCnt == 1) - { - // These intrinsics are meant to set the same value to every element - // so we'll just specially handle it here and copy it into the remaining - // indices. 
- - for (unsigned i = 1; i < simdSize / genTypeSize(baseType); i++) - { - HandleArgForHWIntrinsicCreate(op1, i, vecCns, baseType); - } - } - } - assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(baseType)))); - - if (argCnt == cnsArgCnt) - { - if (op1->OperIsList()) - { - for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) - { - GenTree* arg = argList->Current(); - -#if !defined(TARGET_64BIT) - if (arg->OperIsLong()) - { - BlockRange().Remove(arg->AsOp()->gtOp1); - BlockRange().Remove(arg->AsOp()->gtOp2); - } -#endif // !TARGET_64BIT - - BlockRange().Remove(arg); - } - } - else - { -#if !defined(TARGET_64BIT) - if (op1->OperIsLong()) - { - BlockRange().Remove(op1->AsOp()->gtOp1); - BlockRange().Remove(op1->AsOp()->gtOp2); - } -#endif // !TARGET_64BIT - - BlockRange().Remove(op1); - - if (op2 != nullptr) - { -#if defined(TARGET_64BIT) - if (op2->OperIsLong()) - { - BlockRange().Remove(op2->AsOp()->gtOp1); - BlockRange().Remove(op2->AsOp()->gtOp2); - } -#endif // !TARGET_64BIT - - BlockRange().Remove(op2); - } - } - - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32)); - - if ((argCnt == 1) || - ((vecCns.i64[0] == vecCns.i64[1]) && ((simdSize <= 16) || (vecCns.i64[2] == vecCns.i64[3])))) - { - // If we are a single constant or if all parts are the same, we might be able to optimize - // this even further for certain values, such as Zero or AllBitsSet. - - if (vecCns.i64[0] == 0) - { - node->gtOp1 = nullptr; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_Vector128_get_Zero; - return; - } - else if (vecCns.i64[0] == -1) - { - node->gtOp1 = nullptr; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_Vector128_get_AllBitsSet; - return; - } - } - - unsigned cnsSize = (simdSize != 12) ? simdSize : 16; - unsigned cnsAlign = - (comp->compCodeOpt() != Compiler::SMALL_CODE) ? cnsSize : emitter::dataSection::MIN_DATA_ALIGN; - var_types dataType = Compiler::getSIMDTypeForSize(simdSize); - - UNATIVE_OFFSET cnum = comp->GetEmitter()->emitDataConst(&vecCns, cnsSize, cnsAlign, dataType); - CORINFO_FIELD_HANDLE hnd = comp->eeFindJitDataOffs(cnum); - GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr); - BlockRange().InsertBefore(node, clsVarAddr); - - node->ChangeOper(GT_IND); - node->gtOp1 = clsVarAddr; - - // TODO-XARCH-CQ: We should be able to modify at least the paths that use Insert to trivially support partial - // vector constants. With this, we can create a constant if say 50% of the inputs are also constant and just - // insert the non-constant values which should still allow some gains. 
- - return; - } - else if (argCnt == 1) - { - // We have the following (where simd is simd16 or simd32): - // /--* op1 T - // node = * HWINTRINSIC simd T Create - - if (intrinsicId == NI_Vector256_Create) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - // We will be constructing the following parts: - // /--* op1 T - // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* tmp1 simd16 - // node = * HWINTRINSIC simd32 T BroadcastScalarToVector256 - - // This is roughly the following managed code: - // var tmp1 = Vector128.CreateScalarUnsafe(op1); - // return Avx2.BroadcastScalarToVector256(tmp1); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); - - node->gtOp1 = tmp1; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector256; - return; - } - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); - - // We will be constructing the following parts: - // /--* op1 T - // tmp1 = * HWINTRINSIC simd16 T Create - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp2 simd16 - // tmp3 = * HWINTRINSIC simd16 T ToVector256Unsafe - // idx = CNS_INT int 0 - // /--* tmp3 simd32 - // +--* tmp1 simd16 - // +--* idx int - // node = * HWINTRINSIC simd32 T InsertVector128 - - // This is roughly the following managed code: - // var tmp1 = Vector128.Create(op1); - // var tmp2 = tmp1; - // var tmp3 = tmp2.ToVector256Unsafe(); - // return Avx.InsertVector128(tmp3, tmp1, 0x01); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, tmp2, NI_Vector128_ToVector256Unsafe, baseType, 16); - BlockRange().InsertAfter(tmp2, tmp3); - LowerNode(tmp3); - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(tmp3, idx); - - node->gtOp1 = comp->gtNewArgList(tmp3, tmp1, idx); - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_AVX_InsertVector128; - return; - } - - // We will be constructing the following parts: - // /--* op1 T - // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // ... - - // This is roughly the following managed code: - // var tmp1 = Vector128.CreateScalarUnsafe(op1); - // ... - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); - - if ((baseType != TYP_DOUBLE) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // node = * HWINTRINSIC simd16 T BroadcastScalarToVector128 - - // This is roughly the following managed code: - // ... - // return Avx2.BroadcastScalarToVector128(tmp1); - - node->gtOp1 = tmp1; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector128; - return; - } - - switch (baseType) - { - case TYP_BYTE: - case TYP_UBYTE: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) - { - // We will be constructing the following parts: - // ... 
- // tmp2 = HWINTRINSIC simd16 ubyte get_Zero - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 ubyte Shuffle - - // This is roughly the following managed code: - // ... - // var tmp2 = Vector128.Zero; - // return Ssse3.Shuffle(tmp1, tmp2); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, NI_Vector128_get_Zero, TYP_UBYTE, simdSize); - BlockRange().InsertAfter(tmp1, tmp2); - LowerNode(tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSSE3_Shuffle; - break; - } - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp1 = * HWINTRINSIC simd16 ubyte UnpackLow - // ... - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // tmp1 = Sse2.UnpackLow(tmp1, tmp2); - // ... - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); - BlockRange().InsertAfter(tmp2, tmp1); - LowerNode(tmp1); - - FALLTHROUGH; - } - - case TYP_SHORT: - case TYP_USHORT: - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp1 = * HWINTRINSIC simd16 ushort UnpackLow - // ... - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // tmp1 = Sse2.UnpackLow(tmp1, tmp2); - // ... - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); - BlockRange().InsertAfter(tmp2, tmp1); - LowerNode(tmp1); - - FALLTHROUGH; - } - - case TYP_INT: - case TYP_UINT: - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int 0 - // /--* tmp1 simd16 - // +--* idx int - // node = * HWINTRINSIC simd16 uint Shuffle - - // This is roughly the following managed code: - // ... - // return Sse2.Shuffle(tmp1, 0x00); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - idx = comp->gtNewIconNode(0x00, TYP_INT); - BlockRange().InsertAfter(tmp1, idx); - - node->gtOp1 = tmp1; - node->gtOp2 = idx; - - node->gtHWIntrinsicId = NI_SSE2_Shuffle; - node->gtSIMDBaseType = TYP_UINT; - - break; - } - -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 ulong UnpackLow - - // This is roughly the following managed code: - // ... 
- // var tmp2 = tmp1; - // return Sse2.UnpackLow(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSE2_UnpackLow; - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int 0 - // /--* tmp1 simd16 - // +--* idx int - // node = * HWINTRINSIC simd16 float Permute - - // This is roughly the following managed code: - // ... - // return Avx.Permute(tmp1, 0x00); - - idx = comp->gtNewIconNode(0x00, TYP_INT); - BlockRange().InsertAfter(tmp1, idx); - - node->gtOp1 = tmp1; - node->gtOp2 = idx; - - node->gtHWIntrinsicId = NI_AVX_Permute; - break; - } - - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // idx = CNS_INT int 0 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // +--* idx int - // node = * HWINTRINSIC simd16 float Shuffle - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // return Sse.Shuffle(tmp1, tmp2, 0x00); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - idx = comp->gtNewIconNode(0x00, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_SSE_Shuffle; - break; - } - - case TYP_DOUBLE: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // node = * HWINTRINSIC simd16 double MoveAndDuplicate - - // This is roughly the following managed code: - // ... - // return Sse3.MoveAndDuplicate(tmp1); - - node->gtOp1 = tmp1; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_SSE3_MoveAndDuplicate; - break; - } - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 float MoveLowToHigh - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // return Sse.MoveLowToHigh(tmp1, tmp2); - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; - node->gtSIMDBaseType = TYP_FLOAT; - - break; - } - - default: - { - unreached(); - } - } - - return; - } - - // We have the following (where simd is simd16 or simd32): - // /--* op1 T - // +--* ... 
T - // +--* opN T - // node = * HWINTRINSIC simd T Create - - if (intrinsicId == NI_Vector256_Create) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); - - // We will be constructing the following parts: - // /--* op1 T - // +--* ... T - // lo = * HWINTRINSIC simd16 T Create - // /--* ... T - // +--* opN T - // hi = * HWINTRINSIC simd16 T Create - // idx = CNS_INT int 1 - // /--* lo simd32 - // +--* hi simd16 - // +--* idx int - // node = * HWINTRINSIC simd32 T InsertVector128 - - // This is roughly the following managed code: - // ... - // var lo = Vector128.Create(op1, ...); - // var hi = Vector128.Create(..., opN); - // return Avx.InsertVector128(lo, hi, 0x01); - - // Each Vector128.Create call gets half the operands. That is: - // lo = Vector128.Create(op1, op2); - // hi = Vector128.Create(op3, op4); - // -or- - // lo = Vector128.Create(op1, ..., op3); - // hi = Vector128.Create(op4, ..., op7); - // -or- - // lo = Vector128.Create(op1, ..., op7); - // hi = Vector128.Create(op8, ..., op15); - // -or- - // lo = Vector128.Create(op1, ..., op15); - // hi = Vector128.Create(op16, ..., op31); - - unsigned halfArgCnt = argCnt / 2; - assert((halfArgCnt * 2) == argCnt); - - argList = op1->AsArgList(); - - for (unsigned i = 0; i < halfArgCnt; i++) - { - op2 = argList; - argList = argList->Rest(); - } - - op2->AsArgList()->gtOp2 = nullptr; - op2 = argList; - - // The above for loop splits the operand count into exactly half. - // Once it exits, op1 will point to op1 and op2 will point to the - // last operand that will be passed to the first Vector128.Create - // We will set its op2 to null, terminating the chain and then - // assign op2 to be argList, which is the first operand that will - // get passed to the second Vector128.Create - - GenTree* lo = nullptr; - GenTree* hi = nullptr; - - if (halfArgCnt == 2) - { - // The Vector256.Create calls that take 4 operands are special - // because the half argument count is 2, which means we can't - // actually use the GT_LIST anymore and need to pass them as - // explicit operands instead. 
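The splitting scheme described in the removed comments above — half of the operands go to each Vector128.Create, and the upper half is inserted at index 1 — can be written directly against the managed API. A small sketch under the assumption that AVX is available (class and variable names are illustrative):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class Vector256CreateSketch
    {
        static void Main()
        {
            Vector256<int> direct = Vector256.Create(1, 2, 3, 4, 5, 6, 7, 8);

            // lo gets the first half of the operands, hi the second half; the upper
            // 128 bits are then filled with InsertVector128 at index 1, mirroring
            // the NI_AVX_InsertVector128 node built by the lowering.
            Vector128<int> lo = Vector128.Create(1, 2, 3, 4);
            Vector128<int> hi = Vector128.Create(5, 6, 7, 8);

            if (Avx.IsSupported)
            {
                Vector256<int> split = Avx.InsertVector128(lo.ToVector256Unsafe(), hi, 1);
                Console.WriteLine(direct.Equals(split)); // True
            }
        }
    }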
- - argList = op1->AsArgList(); - - tmp1 = argList->Current(); - tmp2 = argList->Rest()->Current(); - - lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); - BlockRange().InsertAfter(tmp2, lo); - LowerNode(lo); - - argList = op2->AsArgList(); - - tmp1 = argList->Current(); - tmp2 = argList->Rest()->Current(); - - hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); - BlockRange().InsertAfter(tmp2, hi); - LowerNode(hi); - } - else - { - // The rest of the Vector256.Create calls take at least 8 operands - // and so the half count is at least 4 and we have to continue - // passing around GT_LIST nodes in op1 with a null op2 - assert(halfArgCnt >= 4); - - tmp1 = op2->AsArgList()->Current(); - - lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); - BlockRange().InsertBefore(tmp1, lo); - LowerNode(lo); - - hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_Create, baseType, 16); - BlockRange().InsertBefore(node, hi); - LowerNode(hi); - } - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(hi, idx); - - node->gtOp1 = comp->gtNewArgList(lo, hi, idx); - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_AVX_InsertVector128; - return; - } - - if (op1->OperIsList()) - { - argList = op1->AsArgList(); - op1 = argList->Current(); - argList = argList->Rest(); - } - - // We will be constructing the following parts: - // /--* op1 T - // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // ... - - // This is roughly the following managed code: - // var tmp1 = Vector128.CreateScalarUnsafe(op1); - // ... - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); - - switch (baseType) - { - case TYP_BYTE: - case TYP_UBYTE: - case TYP_SHORT: - case TYP_USHORT: - case TYP_INT: - case TYP_UINT: - { - unsigned N = 0; - GenTree* opN = nullptr; - NamedIntrinsic insIntrinsic = NI_Illegal; - - if ((baseType == TYP_SHORT) || (baseType == TYP_USHORT)) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - insIntrinsic = NI_SSE2_Insert; - } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - insIntrinsic = NI_SSE41_Insert; - } - - if (insIntrinsic != NI_Illegal) - { - for (N = 1; N < argCnt - 1; N++) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int N - // /--* tmp1 simd16 - // +--* opN T - // +--* idx int - // tmp1 = * HWINTRINSIC simd16 T Insert - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Sse?.Insert(tmp1, opN, N); - // ... - - opN = argList->Current(); - - idx = comp->gtNewIconNode(N, TYP_INT); - BlockRange().InsertAfter(opN, idx); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, opN, idx, insIntrinsic, baseType, simdSize); - BlockRange().InsertAfter(idx, tmp1); - LowerNode(tmp1); - - argList = argList->Rest(); - } - - assert(N == (argCnt - 1)); - - // We will be constructing the following parts: - // idx = CNS_INT int N - // /--* tmp1 simd16 - // +--* opN T - // +--* idx int - // node = * HWINTRINSIC simd16 T Insert - - // This is roughly the following managed code: - // ... - // tmp1 = Sse?.Insert(tmp1, opN, N); - // ... 
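The Insert-based path above builds the vector one lane at a time: lane 0 comes from CreateScalarUnsafe and each remaining lane is written with pinsrw (Sse2.Insert) or pinsrb/pinsrd (Sse41.Insert). A sketch of the ushort shape, assuming SSE2 hardware (names are illustrative):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class InsertCreateSketch
    {
        // Lane 0 is seeded with CreateScalarUnsafe, lanes 1..7 are inserted in order,
        // matching the "tmp1 = Sse?.Insert(tmp1, opN, N)" chain in the comments above.
        static Vector128<ushort> CreateViaInsert(ushort e0, ushort e1, ushort e2, ushort e3,
                                                 ushort e4, ushort e5, ushort e6, ushort e7)
        {
            Vector128<ushort> v = Vector128.CreateScalarUnsafe(e0);
            v = Sse2.Insert(v, e1, 1);
            v = Sse2.Insert(v, e2, 2);
            v = Sse2.Insert(v, e3, 3);
            v = Sse2.Insert(v, e4, 4);
            v = Sse2.Insert(v, e5, 5);
            v = Sse2.Insert(v, e6, 6);
            v = Sse2.Insert(v, e7, 7);
            return v;
        }

        static void Main()
        {
            var expected = Vector128.Create((ushort)1, 2, 3, 4, 5, 6, 7, 8);
            Console.WriteLine(expected.Equals(CreateViaInsert(1, 2, 3, 4, 5, 6, 7, 8))); // True
        }
    }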
- - opN = argList->Current(); - - idx = comp->gtNewIconNode(N, TYP_INT); - BlockRange().InsertAfter(opN, idx); - - node->gtOp1 = comp->gtNewArgList(tmp1, opN, idx); - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = insIntrinsic; - break; - } - - assert((baseType != TYP_SHORT) && (baseType != TYP_USHORT)); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - GenTree* op[16]; - op[0] = tmp1; - - for (N = 1; N < argCnt; N++) - { - opN = argList->Current(); - - op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(opN, op[N]); - LowerNode(op[N]); - - argList = argList->Rest(); - } - assert(argList == nullptr); - - if ((baseType == TYP_BYTE) || (baseType == TYP_UBYTE)) - { - for (N = 0; N < argCnt; N += 4) - { - // We will be constructing the following parts: - // ... - // /--* opN T - // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opO T - // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opN simd16 - // +--* opO simd16 - // tmp1 = * HWINTRINSIC simd16 T UnpackLow - // /--* opP T - // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opQ T - // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opP simd16 - // +--* opQ simd16 - // tmp2 = * HWINTRINSIC simd16 T UnpackLow - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp3 = * HWINTRINSIC simd16 T UnpackLow - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Sse2.UnpackLow(opN, opO); - // tmp2 = Sse2.UnpackLow(opP, opQ); - // tmp3 = Sse2.UnpackLow(tmp1, tmp2); - // ... - - unsigned O = N + 1; - unsigned P = N + 2; - unsigned Q = N + 3; - - tmp1 = - comp->gtNewSimdHWIntrinsicNode(simdType, op[N], op[O], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); - BlockRange().InsertAfter(op[O], tmp1); - LowerNode(tmp1); - - tmp2 = - comp->gtNewSimdHWIntrinsicNode(simdType, op[P], op[Q], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); - BlockRange().InsertAfter(op[Q], tmp2); - LowerNode(tmp2); - - tmp3 = - comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); - BlockRange().InsertAfter(tmp2, tmp3); - LowerNode(tmp3); - - // This caches the result in index 0 through 3, depending on which - // loop iteration this is and allows the rest of the logic to be - // shared with the TYP_INT and TYP_UINT path. - - op[N / 4] = tmp3; - } - } - - // We will be constructing the following parts: - // ... - // /--* opN T - // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opO T - // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opN simd16 - // +--* opO simd16 - // tmp1 = * HWINTRINSIC simd16 T UnpackLow - // /--* opP T - // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opQ T - // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opP simd16 - // +--* opQ simd16 - // tmp2 = * HWINTRINSIC simd16 T UnpackLow - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T UnpackLow - - // This is roughly the following managed code: - // ... 
- // tmp1 = Sse2.UnpackLow(opN, opO); - // tmp2 = Sse2.UnpackLow(opP, opQ); - // return Sse2.UnpackLow(tmp1, tmp2); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE2_UnpackLow, TYP_UINT, simdSize); - BlockRange().InsertAfter(op[1], tmp1); - LowerNode(tmp1); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE2_UnpackLow, TYP_UINT, simdSize); - BlockRange().InsertAfter(op[3], tmp2); - LowerNode(tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSE2_UnpackLow; - node->gtSIMDBaseType = TYP_ULONG; - break; - } - -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int 1 - // /--* tmp1 simd16 - // +--* op2 T - // +--* idx int - // node = * HWINTRINSIC simd16 T Insert - - // This is roughly the following managed code: - // ... - // return Sse41.X64.Insert(tmp1, op2, 0x01); - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertBefore(node, idx); - - node->gtOp1 = comp->gtNewArgList(tmp1, op2, idx); - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_SSE41_X64_Insert; - break; - } - - // We will be constructing the following parts: - // ... - // /--* op2 T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T UnpackLow - - // This is roughly the following managed code: - // ... - // var tmp2 = Vector128.CreateScalarUnsafe(op2); - // return Sse2.UnpackLow(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(op2, tmp2); - LowerNode(tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSE2_UnpackLow; - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - { - unsigned N = 0; - GenTree* opN = nullptr; - - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - for (N = 1; N < argCnt - 1; N++) - { - // We will be constructing the following parts: - // ... - // - // /--* opN T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // idx = CNS_INT int N - // /--* tmp1 simd16 - // +--* opN T - // +--* idx int - // tmp1 = * HWINTRINSIC simd16 T Insert - // ... - - // This is roughly the following managed code: - // ... - // tmp2 = Vector128.CreateScalarUnsafe(opN); - // tmp1 = Sse41.Insert(tmp1, tmp2, N << 4); - // ... - - opN = argList->Current(); - - tmp2 = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(opN, tmp2); - LowerNode(tmp2); - - idx = comp->gtNewIconNode(N << 4, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - tmp1 = - comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, idx, NI_SSE41_Insert, baseType, simdSize); - BlockRange().InsertAfter(idx, tmp1); - LowerNode(tmp1); - - argList = argList->Rest(); - } - - // We will be constructing the following parts: - // ... - // - // /--* opN T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // idx = CNS_INT int N - // /--* tmp1 simd16 - // +--* opN T - // +--* idx int - // node = * HWINTRINSIC simd16 T Insert - - // This is roughly the following managed code: - // ... 
- // tmp2 = Vector128.CreateScalarUnsafe(opN); - // return Sse41.Insert(tmp1, tmp2, N << 4); - - opN = argList->Current(); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(opN, tmp2); - LowerNode(tmp2); - - idx = comp->gtNewIconNode((argCnt - 1) << 4, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_SSE41_Insert; - break; - } - - // We will be constructing the following parts: - // ... - // /--* opN T - // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opO T - // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opN simd16 - // +--* opO simd16 - // tmp1 = * HWINTRINSIC simd16 T UnpackLow - // /--* opP T - // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opQ T - // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opP simd16 - // +--* opQ simd16 - // tmp2 = * HWINTRINSIC simd16 T UnpackLow - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T MoveLowToHigh - - // This is roughly the following managed code: - // ... - // tmp1 = Sse.UnpackLow(opN, opO); - // tmp2 = Sse.UnpackLow(opP, opQ); - // return Sse.MoveLowToHigh(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); - - GenTree* op[4]; - op[0] = tmp1; - - for (N = 1; N < argCnt; N++) - { - opN = argList->Current(); - - op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(opN, op[N]); - LowerNode(op[N]); - - argList = argList->Rest(); - } - assert(argList == nullptr); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE_UnpackLow, baseType, simdSize); - BlockRange().InsertAfter(op[1], tmp1); - LowerNode(tmp1); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE_UnpackLow, baseType, simdSize); - BlockRange().InsertAfter(op[3], tmp2); - LowerNode(tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; - break; - } - - case TYP_DOUBLE: - { - // We will be constructing the following parts: - // ... - // /--* op2 T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T MoveLowToHigh - - // This is roughly the following managed code: - // ... - // var tmp2 = Vector128.CreateScalarUnsafe(op2); - // return Sse.MoveLowToHigh(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); - BlockRange().InsertAfter(op2, tmp2); - LowerNode(tmp2); - - node->gtOp1 = tmp1; - node->gtOp2 = tmp2; - - node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; - node->gtSIMDBaseType = TYP_FLOAT; - - break; - } - - default: - { - unreached(); - } - } -} - -//---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicDot: Lowers a Vector128 or Vector256 Dot call -// -// Arguments: -// node - The hardware intrinsic node. 
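The pre-SSE4.1 float path removed above widens each scalar with CreateScalarUnsafe and combines them with unpcklps and movlhps. A managed sketch of that shape, assuming SSE is available (names are illustrative):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class FloatCreateSketch
    {
        // Mirrors the removed comments: tmp1 = Sse.UnpackLow(opN, opO),
        // tmp2 = Sse.UnpackLow(opP, opQ), result = Sse.MoveLowToHigh(tmp1, tmp2).
        static Vector128<float> CreateViaUnpack(float e0, float e1, float e2, float e3)
        {
            Vector128<float> v0 = Vector128.CreateScalarUnsafe(e0);
            Vector128<float> v1 = Vector128.CreateScalarUnsafe(e1);
            Vector128<float> v2 = Vector128.CreateScalarUnsafe(e2);
            Vector128<float> v3 = Vector128.CreateScalarUnsafe(e3);

            Vector128<float> lo = Sse.UnpackLow(v0, v1);   // e0, e1, *, *
            Vector128<float> hi = Sse.UnpackLow(v2, v3);   // e2, e3, *, *
            return Sse.MoveLowToHigh(lo, hi);              // e0, e1, e2, e3
        }

        static void Main()
        {
            var expected = Vector128.Create(1f, 2f, 3f, 4f);
            Console.WriteLine(expected.Equals(CreateViaUnpack(1f, 2f, 3f, 4f))); // True
        }
    }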
-// -void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) -{ - NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; - ; - var_types baseType = node->gtSIMDBaseType; - unsigned simdSize = node->gtSIMDSize; - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - unsigned simd16Count = comp->getSIMDVectorLength(16, baseType); - - assert((intrinsicId == NI_Vector128_Dot) || (intrinsicId == NI_Vector256_Dot)); - assert(varTypeIsSIMD(simdType)); - assert(varTypeIsArithmetic(baseType)); - assert(simdSize != 0); - - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - assert(op1 != nullptr); - assert(op2 != nullptr); - assert(!op1->OperIsList()); - - // Spare GenTrees to be used for the lowering logic below - // Defined upfront to avoid naming conflicts, etc... - GenTree* idx = nullptr; - GenTree* tmp1 = nullptr; - GenTree* tmp2 = nullptr; - GenTree* tmp3 = nullptr; - - NamedIntrinsic multiply = NI_Illegal; - NamedIntrinsic horizontalAdd = NI_Illegal; - NamedIntrinsic add = NI_Illegal; - NamedIntrinsic shuffle = NI_Illegal; - - if (simdSize == 32) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - - switch (baseType) - { - case TYP_SHORT: - case TYP_USHORT: - case TYP_INT: - case TYP_UINT: - { - multiply = NI_AVX2_MultiplyLow; - horizontalAdd = NI_AVX2_HorizontalAdd; - add = NI_AVX2_Add; - break; - } - - case TYP_FLOAT: - { - // We will be constructing the following parts: - // idx = CNS_INT int 0xF1 - // /--* op1 simd16 - // +--* op2 simd16 - // +--* idx int - // tmp1 = * HWINTRINSIC simd16 T DotProduct - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // idx = CNS_INT int 0x01 - // /--* tmp2 simd16 - // +--* idx int - // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp3 = * HWINTRINSIC simd16 T Add - // /--* tmp3 simd16 - // node = * HWINTRINSIC simd16 T ToScalar - - // This is roughly the following managed code: - // var tmp1 = Avx.DotProduct(op1, op2, 0xFF); - // var tmp2 = Avx.ExtractVector128(tmp1, 0x01); - // var tmp3 = Sse.Add(tmp1, tmp2); - // return tmp3.ToScalar(); - - idx = comp->gtNewIconNode(0xF1, TYP_INT); - BlockRange().InsertBefore(node, idx); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_AVX_DotProduct, baseType, simdSize); - BlockRange().InsertAfter(idx, tmp1); - LowerNode(tmp1); - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - tmp2 = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize); - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); - - tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_SSE_Add, baseType, 16); - BlockRange().InsertAfter(tmp2, tmp3); - LowerNode(tmp3); - - node->gtSIMDSize = 16; - - node->gtOp1 = tmp3; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_Vector128_ToScalar; - LowerNode(node); - - return; - } - - case TYP_DOUBLE: - { - multiply = NI_AVX_Multiply; - horizontalAdd = NI_AVX_HorizontalAdd; - add = NI_AVX_Add; - break; - } - - default: - { - unreached(); - } - } - } - else - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - switch (baseType) - { - case TYP_SHORT: - case TYP_USHORT: - { - multiply = NI_SSE2_MultiplyLow; - horizontalAdd = 
NI_SSSE3_HorizontalAdd; - add = NI_SSE2_Add; - - if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) - { - shuffle = NI_SSE2_ShuffleLow; - } - break; - } - - case TYP_INT: - case TYP_UINT: - { - multiply = NI_SSE41_MultiplyLow; - horizontalAdd = NI_SSSE3_HorizontalAdd; - add = NI_SSE2_Add; - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41)); - break; - } - - case TYP_FLOAT: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We will be constructing the following parts: - // idx = CNS_INT int 0xFF - // /--* op1 simd16 - // +--* op2 simd16 - // +--* idx int - // tmp3 = * HWINTRINSIC simd16 T DotProduct - // /--* tmp3 simd16 - // node = * HWINTRINSIC simd16 T ToScalar - - // This is roughly the following managed code: - // var tmp3 = Avx.DotProduct(op1, op2, 0xFF); - // return tmp3.ToScalar(); - - if (simdSize == 8) - { - idx = comp->gtNewIconNode(0x31, TYP_INT); - } - else if (simdSize == 12) - { - idx = comp->gtNewIconNode(0x71, TYP_INT); - } - else - { - assert(simdSize == 16); - idx = comp->gtNewIconNode(0xF1, TYP_INT); - } - BlockRange().InsertBefore(node, idx); - - tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType, - simdSize); - BlockRange().InsertAfter(idx, tmp3); - LowerNode(tmp3); - - node->gtOp1 = tmp3; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_Vector128_ToScalar; - LowerNode(node); - - return; - } - - multiply = NI_SSE_Multiply; - horizontalAdd = NI_SSE3_HorizontalAdd; - add = NI_SSE_Add; - - if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) - { - shuffle = NI_SSE_Shuffle; - } - break; - } - - case TYP_DOUBLE: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We will be constructing the following parts: - // idx = CNS_INT int 0x31 - // /--* op1 simd16 - // +--* op2 simd16 - // +--* idx int - // tmp3 = * HWINTRINSIC simd16 T DotProduct - // /--* tmp3 simd16 - // node = * HWINTRINSIC simd16 T ToScalar - - // This is roughly the following managed code: - // var tmp3 = Avx.DotProduct(op1, op2, 0x31); - // return tmp3.ToScalar(); - - idx = comp->gtNewIconNode(0x31, TYP_INT); - BlockRange().InsertBefore(node, idx); - - tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE41_DotProduct, baseType, - simdSize); - BlockRange().InsertAfter(idx, tmp3); - LowerNode(tmp3); - - node->gtOp1 = tmp3; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_Vector128_ToScalar; - LowerNode(node); - - return; - } - - multiply = NI_SSE2_Multiply; - horizontalAdd = NI_SSE3_HorizontalAdd; - add = NI_SSE2_Add; - - if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) - { - shuffle = NI_SSE2_Shuffle; - } - break; - } - - default: - { - unreached(); - } - } - - if (simdSize == 8) - { - assert(baseType == TYP_FLOAT); - - // If simdSize == 8 then we have only two elements, not the 4 that we got from getSIMDVectorLength, - // which we gave a simdSize of 16. So, we set the simd16Count to 2 so that only 1 hadd will - // be emitted rather than 2, so that the upper two elements will be ignored. - - simd16Count = 2; - } - else if (simdSize == 12) - { - assert(baseType == TYP_FLOAT); - - // We will be constructing the following parts: - // ... - // +--* CNS_INT int -1 - // +--* CNS_INT int -1 - // +--* CNS_INT int -1 - // +--* CNS_INT int 0 - // tmp1 = * HWINTRINSIC simd16 T Create - // /--* op2 simd16 - // +--* tmp1 simd16 - // op1 = * HWINTRINSIC simd16 T And - // ... - - // This is roughly the following managed code: - // ... 
- // tmp1 = Vector128.Create(-1, -1, -1, 0); - // op1 = Sse.And(op1, tmp2); - // ... - - GenTree* cns0 = comp->gtNewIconNode(-1, TYP_INT); - BlockRange().InsertAfter(op1, cns0); - - GenTree* cns1 = comp->gtNewIconNode(-1, TYP_INT); - BlockRange().InsertAfter(cns0, cns1); - - GenTree* cns2 = comp->gtNewIconNode(-1, TYP_INT); - BlockRange().InsertAfter(cns1, cns2); - - GenTree* cns3 = comp->gtNewIconNode(0, TYP_INT); - BlockRange().InsertAfter(cns2, cns3); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, cns0, cns1, cns2, cns3, NI_Vector128_Create, TYP_INT, 16); - BlockRange().InsertAfter(cns3, tmp1); - LowerNode(tmp1); - - op1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, tmp1, NI_SSE_And, baseType, simdSize); - BlockRange().InsertAfter(tmp1, op1); - LowerNode(op1); - } - } - - // We will be constructing the following parts: - // /--* op1 simd16 - // +--* op2 simd16 - // tmp1 = * HWINTRINSIC simd16 T Multiply - // ... - - // This is roughly the following managed code: - // var tmp1 = Isa.Multiply(op1, op2); - // ... - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, multiply, baseType, simdSize); - BlockRange().InsertBefore(node, tmp1); - LowerNode(tmp1); - - // HorizontalAdd combines pairs so we need log2(simd16Count) passes to sum all elements together. - int haddCount = genLog2(simd16Count); - - for (int i = 0; i < haddCount; i++) - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // ... - - // This is roughly the following managed code: - // ... - // tmp2 = tmp1; - // ... - - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - if (shuffle == NI_Illegal) - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp1 = * HWINTRINSIC simd16 T HorizontalAdd - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Isa.HorizontalAdd(tmp1, tmp2); - // ... - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, horizontalAdd, baseType, simdSize); - } - else - { - int shuffleConst = 0x00; - - switch (i) - { - case 0: - { - assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || varTypeIsFloating(baseType)); - - // Adds (e0 + e1, e1 + e0, e2 + e3, e3 + e2), giving: - // e0, e1, e2, e3 | e4, e5, e6, e7 - // e1, e0, e3, e2 | e5, e4, e7, e6 - // ... - - shuffleConst = 0xB1; - break; - } - - case 1: - { - assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT) || (baseType == TYP_FLOAT)); - - // Adds (e0 + e2, e1 + e3, e2 + e0, e3 + e1), giving: - // ... - // e2, e3, e0, e1 | e6, e7, e4, e5 - // e3, e2, e1, e0 | e7, e6, e5, e4 - - shuffleConst = 0x4E; - break; - } - - case 2: - { - assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT)); - - // Adds (e0 + e4, e1 + e5, e2 + e6, e3 + e7), giving: - // ... - // e4, e5, e6, e7 | e0, e1, e2, e3 - // e5, e4, e7, e6 | e1, e0, e3, e2 - // e6, e7, e4, e5 | e2, e3, e0, e1 - // e7, e6, e5, e4 | e3, e2, e1, e0 - - shuffleConst = 0x4D; - break; - } - - default: - { - unreached(); - } - } - - idx = comp->gtNewIconNode(shuffleConst, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - if (varTypeIsFloating(baseType)) - { - // We will be constructing the following parts: - // ... 
- // /--* tmp2 simd16 - // * STORE_LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // tmp3 = LCL_VAR simd16 - // idx = CNS_INT int shuffleConst - // /--* tmp2 simd16 - // +--* tmp3 simd16 - // +--* idx simd16 - // tmp2 = * HWINTRINSIC simd16 T Shuffle - // ... - - // This is roughly the following managed code: - // ... - // tmp3 = tmp2; - // tmp2 = Isa.Shuffle(tmp2, tmp3, shuffleConst); - // ... - - node->gtOp1 = tmp2; - LIR::Use tmp2Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp2Use); - tmp2 = node->gtOp1; - - tmp3 = comp->gtClone(tmp2); - BlockRange().InsertAfter(tmp2, tmp3); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, tmp3, idx, shuffle, baseType, simdSize); - } - else - { - assert((baseType == TYP_SHORT) || (baseType == TYP_USHORT)); - - if (i < 2) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int shuffleConst - // /--* tmp2 simd16 - // +--* idx simd16 - // tmp2 = * HWINTRINSIC simd16 T ShuffleLow - // idx = CNS_INT int shuffleConst - // /--* tmp2 simd16 - // +--* idx simd16 - // tmp2 = * HWINTRINSIC simd16 T ShuffleHigh - // ... - - // This is roughly the following managed code: - // ... - // tmp2 = Isa.Shuffle(tmp1, shuffleConst); - // ... - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleLow, baseType, simdSize); - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); - - idx = comp->gtNewIconNode(shuffleConst, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_ShuffleHigh, baseType, simdSize); - } - else - { - assert(i == 2); - - // We will be constructing the following parts: - // ... - // idx = CNS_INT int shuffleConst - // /--* tmp2 simd16 - // +--* idx simd16 - // tmp2 = * HWINTRINSIC simd16 T ShuffleLow - // ... - - // This is roughly the following managed code: - // ... - // tmp2 = Isa.Shuffle(tmp1, shuffleConst); - // ... - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp2, idx, NI_SSE2_Shuffle, TYP_INT, simdSize); - } - } - - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); - - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp1 = * HWINTRINSIC simd16 T Add - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Isa.Add(tmp1, tmp2); - // ... - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, add, baseType, simdSize); - } - - BlockRange().InsertAfter(tmp2, tmp1); - LowerNode(tmp1); - } - - if (simdSize == 32) - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // idx = CNS_INT int 0x01 - // /--* tmp2 simd16 - // +--* idx int - // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp1 = * HWINTRINSIC simd16 T Add - // ... - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // tmp2 = Avx.ExtractVector128(tmp2, 0x01); - // var tmp1 = Isa.Add(tmp1, tmp2); - // ... 
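The reduction loop above multiplies element-wise and then folds lanes together with log2(lane count) horizontal adds (plus one ExtractVector128/Add step for 256-bit vectors) before taking element 0. A managed sketch of the 128-bit float shape, guarded on the ISAs it assumes (names are illustrative):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    class DotReductionSketch
    {
        // The haddps-based shape: one multiply, then log2(4) = 2 horizontal adds,
        // then ToScalar, as in the removed comments.
        static float DotViaHorizontalAdd(Vector128<float> a, Vector128<float> b)
        {
            Vector128<float> t = Sse.Multiply(a, b);
            t = Sse3.HorizontalAdd(t, t);
            t = Sse3.HorizontalAdd(t, t);
            return t.ToScalar();
        }

        static void Main()
        {
            var a = Vector128.Create(1f, 2f, 3f, 4f);
            var b = Vector128.Create(5f, 6f, 7f, 8f);

            if (Sse3.IsSupported)
                Console.WriteLine(DotViaHorizontalAdd(a, b));               // 70

            // The SSE4.1 fast path removed earlier collapses all of this into a
            // single dpps with an 0xF1 control byte.
            if (Sse41.IsSupported)
                Console.WriteLine(Sse41.DotProduct(a, b, 0xF1).ToScalar()); // 70
        }
    }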
- - node->gtOp1 = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->gtOp1; - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, idx, NI_AVX_ExtractVector128, baseType, simdSize); - BlockRange().InsertAfter(idx, tmp2); - LowerNode(tmp2); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, add, baseType, 16); - BlockRange().InsertAfter(tmp2, tmp1); - LowerNode(tmp1); - - node->gtSIMDSize = 16; - } - - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // node = * HWINTRINSIC simd16 T ToScalar - - // This is roughly the following managed code: - // ... - // return tmp1.ToScalar(); - - node->gtOp1 = tmp1; - node->gtOp2 = nullptr; - - node->gtHWIntrinsicId = NI_Vector128_ToScalar; - LowerNode(node); - - return; -} - -//---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicToScalar: Lowers a Vector128 or Vector256 ToScalar call -// -// Arguments: -// node - The hardware intrinsic node. -// -void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) -{ - NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; - ; - var_types baseType = node->gtSIMDBaseType; - unsigned simdSize = node->gtSIMDSize; - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - - assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar)); - assert(varTypeIsSIMD(simdType)); - assert(varTypeIsArithmetic(baseType)); - assert(simdSize != 0); - - switch (baseType) - { - case TYP_BYTE: - case TYP_SHORT: - case TYP_INT: - { - node->gtType = TYP_INT; - node->gtSIMDBaseType = TYP_INT; - node->gtHWIntrinsicId = NI_SSE2_ConvertToInt32; - break; - } - - case TYP_UBYTE: - case TYP_USHORT: - case TYP_UINT: - { - node->gtType = TYP_UINT; - node->gtSIMDBaseType = TYP_UINT; - node->gtHWIntrinsicId = NI_SSE2_ConvertToUInt32; - break; - } - -#if defined(TARGET_AMD64) - case TYP_LONG: - { - node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToInt64; - break; - } - - case TYP_ULONG: - { - node->gtHWIntrinsicId = NI_SSE2_X64_ConvertToUInt64; - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - case TYP_DOUBLE: - { - ContainCheckHWIntrinsic(node); - return; - } - - default: - { - unreached(); - } - } - - LowerNode(node); - - if (genTypeSize(baseType) < 4) - { - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - - GenTreeCast* cast = comp->gtNewCastNode(baseType, node, node->IsUnsigned(), baseType); - BlockRange().InsertAfter(node, cast); - - if (foundUse) - { - use.ReplaceWith(comp, cast); - } - LowerNode(cast); - } -} -#endif // FEATURE_HW_INTRINSICS - -//---------------------------------------------------------------------------------------------- -// Lowering::IsRMWIndirCandidate: -// Returns true if the given operand is a candidate indirection for a read-modify-write -// operator. -// -// Arguments: -// operand - The operand to consider. -// storeInd - The indirect store that roots the possible RMW operator. -// -bool Lowering::IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd) -{ - // If the operand isn't an indirection, it's trivially not a candidate. 
- if (operand->OperGet() != GT_IND) - { - return false; - } - - // If the indirection's source address isn't equivalent to the destination address of the storeIndir, then the - // indirection is not a candidate. - GenTree* srcAddr = operand->gtGetOp1(); - GenTree* dstAddr = storeInd->gtGetOp1(); - if ((srcAddr->OperGet() != dstAddr->OperGet()) || !IndirsAreEquivalent(operand, storeInd)) - { - return false; - } - - // If it is not safe to contain the entire tree rooted at the indirection, then the indirection is not a - // candidate. Crawl the IR from the node immediately preceding the storeIndir until the last node in the - // indirection's tree is visited and check the side effects at each point. - - m_scratchSideEffects.Clear(); - - assert((operand->gtLIRFlags & LIR::Flags::Mark) == 0); - operand->gtLIRFlags |= LIR::Flags::Mark; - - unsigned markCount = 1; - GenTree* node; - for (node = storeInd->gtPrev; markCount > 0; node = node->gtPrev) - { - assert(node != nullptr); - - if ((node->gtLIRFlags & LIR::Flags::Mark) == 0) - { - m_scratchSideEffects.AddNode(comp, node); - } - else - { - node->gtLIRFlags &= ~LIR::Flags::Mark; - markCount--; - - if (m_scratchSideEffects.InterferesWith(comp, node, false)) - { - // The indirection's tree contains some node that can't be moved to the storeInder. The indirection is - // not a candidate. Clear any leftover mark bits and return. - for (; markCount > 0; node = node->gtPrev) - { - if ((node->gtLIRFlags & LIR::Flags::Mark) != 0) - { - node->gtLIRFlags &= ~LIR::Flags::Mark; - markCount--; - } - } - return false; - } - - node->VisitOperands([&markCount](GenTree* nodeOperand) -> GenTree::VisitResult { - assert((nodeOperand->gtLIRFlags & LIR::Flags::Mark) == 0); - nodeOperand->gtLIRFlags |= LIR::Flags::Mark; - markCount++; - return GenTree::VisitResult::Continue; - }); - } - } - - // At this point we've verified that the operand is an indirection, its address is equivalent to the storeIndir's - // destination address, and that it and the transitive closure of its operand can be safely contained by the - // storeIndir. This indirection is therefore a candidate for an RMW op. - return true; -} - -//---------------------------------------------------------------------------------------------- -// Returns true if this tree is bin-op of a GT_STOREIND of the following form -// storeInd(subTreeA, binOp(gtInd(subTreeA), subtreeB)) or -// storeInd(subTreeA, binOp(subtreeB, gtInd(subTreeA)) in case of commutative bin-ops -// -// The above form for storeInd represents a read-modify-write memory binary operation. -// -// Parameters -// tree - GentreePtr of binOp -// -// Return Value -// True if 'tree' is part of a RMW memory operation pattern -// -bool Lowering::IsBinOpInRMWStoreInd(GenTree* tree) -{ - // Must be a non floating-point type binary operator since SSE2 doesn't support RMW memory ops - assert(!varTypeIsFloating(tree)); - assert(GenTree::OperIsBinary(tree->OperGet())); - - // Cheap bail out check before more expensive checks are performed. - // RMW memory op pattern requires that one of the operands of binOp to be GT_IND. 
- if (tree->gtGetOp1()->OperGet() != GT_IND && tree->gtGetOp2()->OperGet() != GT_IND) - { - return false; - } - - LIR::Use use; - if (!BlockRange().TryGetUse(tree, &use) || use.User()->OperGet() != GT_STOREIND || use.User()->gtGetOp2() != tree) - { - return false; - } - - // Since it is not relatively cheap to recognize RMW memory op pattern, we - // cache the result in GT_STOREIND node so that while lowering GT_STOREIND - // we can use the result. - GenTree* indirCandidate = nullptr; - GenTree* indirOpSource = nullptr; - return IsRMWMemOpRootedAtStoreInd(use.User(), &indirCandidate, &indirOpSource); -} - -//---------------------------------------------------------------------------------------------- -// This method recognizes the case where we have a treeNode with the following structure: -// storeInd(IndirDst, binOp(gtInd(IndirDst), indirOpSource)) OR -// storeInd(IndirDst, binOp(indirOpSource, gtInd(IndirDst)) in case of commutative operations OR -// storeInd(IndirDst, unaryOp(gtInd(IndirDst)) in case of unary operations -// -// Terminology: -// indirDst = memory write of an addr mode (i.e. storeind destination) -// indirSrc = value being written to memory (i.e. storeind source which could either be a binary or unary op) -// indirCandidate = memory read i.e. a gtInd of an addr mode -// indirOpSource = source operand used in binary/unary op (i.e. source operand of indirSrc node) -// -// In x86/x64 this storeInd pattern can be effectively encoded in a single instruction of the -// following form in case of integer operations: -// binOp [addressing mode], RegIndirOpSource -// binOp [addressing mode], immediateVal -// where RegIndirOpSource is the register where indirOpSource was computed. -// -// Right now, we recognize few cases: -// a) The gtInd child is a lea/lclVar/lclVarAddr/clsVarAddr/constant -// b) BinOp is either add, sub, xor, or, and, shl, rsh, rsz. -// c) unaryOp is either not/neg -// -// Implementation Note: The following routines need to be in sync for RMW memory op optimization -// to be correct and functional. -// IndirsAreEquivalent() -// NodesAreEquivalentLeaves() -// Codegen of GT_STOREIND and genCodeForShiftRMW() -// emitInsRMW() -// -// TODO-CQ: Enable support for more complex indirections (if needed) or use the value numbering -// package to perform more complex tree recognition. -// -// TODO-XArch-CQ: Add support for RMW of lcl fields (e.g. lclfield binop= source) -// -// Parameters: -// tree - GT_STOREIND node -// outIndirCandidate - out param set to indirCandidate as described above -// ouutIndirOpSource - out param set to indirOpSource as described above -// -// Return value -// True if there is a RMW memory operation rooted at a GT_STOREIND tree -// and out params indirCandidate and indirOpSource are set to non-null values. -// Otherwise, returns false with indirCandidate and indirOpSource set to null. -// Also updates flags of GT_STOREIND tree with its RMW status. 
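The storeInd pattern documented above arises from ordinary read-modify-write statements in managed code. A small sketch of source that produces it, with the single-instruction encoding the comment describes shown as a comment (method and variable names are illustrative):

    using System;

    class RmwPatternSketch
    {
        // values[i] += 1 is storeInd(addr, Add(ind(addr), 1)): a load from an address,
        // a binary op, and a store back to the same address. When RMW recognition
        // succeeds, x86/x64 codegen can emit it as a single
        //     add dword ptr [address], 1
        // instead of a separate load, add, and store.
        static void IncrementAll(int[] values)
        {
            for (int i = 0; i < values.Length; i++)
            {
                values[i] += 1;
            }
        }

        static void Main()
        {
            int[] data = { 1, 2, 3 };
            IncrementAll(data);
            Console.WriteLine(string.Join(", ", data)); // 2, 3, 4
        }
    }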
-// -bool Lowering::IsRMWMemOpRootedAtStoreInd(GenTree* tree, GenTree** outIndirCandidate, GenTree** outIndirOpSource) -{ - assert(false); - return true; -} - -// anything is in range for AMD64 -bool Lowering::IsCallTargetInRange(void* addr) -{ - return true; -} - -// return true if the immediate can be folded into an instruction, for example small enough and non-relocatable -bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode) const -{ - if (!childNode->IsIntCnsFitsInI32()) - { - return false; - } - - // At this point we know that it is an int const fits within 4-bytes and hence can safely cast to IntConCommon. - // Icons that need relocation should never be marked as contained immed - if (childNode->AsIntConCommon()->ImmedValNeedsReloc(comp)) - { - return false; - } - - return true; -} - -//----------------------------------------------------------------------- -// PreferredRegOptionalOperand: returns one of the operands of given -// binary oper that is to be preferred for marking as reg optional. -// -// Since only one of op1 or op2 can be a memory operand on xarch, only -// one of them have to be marked as reg optional. Since Lower doesn't -// know apriori which of op1 or op2 is not likely to get a register, it -// has to make a guess. This routine encapsulates heuristics that -// guess whether it is likely to be beneficial to mark op1 or op2 as -// reg optional. -// -// -// Arguments: -// tree - a binary-op tree node that is either commutative -// or a compare oper. -// -// Returns: -// Returns op1 or op2 of tree node that is preferred for -// marking as reg optional. -// -// Note: if the tree oper is neither commutative nor a compare oper -// then only op2 can be reg optional on xarch and hence no need to -// call this routine. -GenTree* Lowering::PreferredRegOptionalOperand(GenTree* tree) -{ - assert(false); - - return NULL; -} - -//------------------------------------------------------------------------ -// Containment analysis -//------------------------------------------------------------------------ - -//------------------------------------------------------------------------ -// ContainCheckCallOperands: Determine whether operands of a call should be contained. -// -// Arguments: -// call - The call node of interest -// -// Return Value: -// None. -// -void Lowering::ContainCheckCallOperands(GenTreeCall* call) -{ - GenTree* ctrlExpr = call->gtControlExpr; - if (call->gtCallType == CT_INDIRECT) - { - // either gtControlExpr != null or gtCallAddr != null. - // Both cannot be non-null at the same time. - assert(ctrlExpr == nullptr); - assert(call->gtCallAddr != nullptr); - ctrlExpr = call->gtCallAddr; - -#ifdef TARGET_X86 - // Fast tail calls aren't currently supported on x86, but if they ever are, the code - // below that handles indirect VSD calls will need to be fixed. - assert(!call->IsFastTailCall() || !call->IsVirtualStub()); -#endif // TARGET_X86 - } - - // set reg requirements on call target represented as control sequence. - if (ctrlExpr != nullptr) - { - // we should never see a gtControlExpr whose type is void. - assert(ctrlExpr->TypeGet() != TYP_VOID); - - // In case of fast tail implemented as jmp, make sure that gtControlExpr is - // computed into a register. - if (!call->IsFastTailCall()) - { -#ifdef TARGET_X86 - // On x86, we need to generate a very specific pattern for indirect VSD calls: - // - // 3-byte nop - // call dword ptr [eax] - // - // Where EAX is also used as an argument to the stub dispatch helper. 
Make - // sure that the call target address is computed into EAX in this case. - if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) - { - assert(ctrlExpr->isIndir()); - MakeSrcContained(call, ctrlExpr); - } - else -#endif // TARGET_X86 - if (ctrlExpr->isIndir()) - { - // We may have cases where we have set a register target on the ctrlExpr, but if it - // contained we must clear it. - ctrlExpr->SetRegNum(REG_NA); - MakeSrcContained(call, ctrlExpr); - } - } - } - - for (GenTreeCall::Use& use : call->Args()) - { - if (use.GetNode()->OperIs(GT_PUTARG_STK)) - { - LowerPutArgStk(use.GetNode()->AsPutArgStk()); - } - } - - for (GenTreeCall::Use& use : call->LateArgs()) - { - if (use.GetNode()->OperIs(GT_PUTARG_STK)) - { - LowerPutArgStk(use.GetNode()->AsPutArgStk()); - } - } -} - -//------------------------------------------------------------------------ -// ContainCheckIndir: Determine whether operands of an indir should be contained. -// -// Arguments: -// node - The indirection node of interest -// -// Notes: -// This is called for both store and load indirections. In the former case, it is assumed that -// LowerStoreIndir() has already been called to check for RMW opportunities. -// -// Return Value: -// None. -// -void Lowering::ContainCheckIndir(GenTreeIndir* node) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// ContainCheckStoreIndir: determine whether the sources of a STOREIND node should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckStoreIndir(GenTreeIndir* node) -{ - // If the source is a containable immediate, make it contained, unless it is - // an int-size or larger store of zero to memory, because we can generate smaller code - // by zeroing a register and then storing it. - GenTree* src = node->AsOp()->gtOp2; - if (IsContainableImmed(node, src) && - (!src->IsIntegralConst(0) || varTypeIsSmall(node) || node->gtGetOp1()->OperGet() == GT_CLS_VAR_ADDR)) - { - MakeSrcContained(node, src); - } - ContainCheckIndir(node); -} - -//------------------------------------------------------------------------ -// ContainCheckMul: determine whether the sources of a MUL node should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckMul(GenTreeOp* node) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// ContainCheckDivOrMod: determine which operands of a div/mod should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckDivOrMod(GenTreeOp* node) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// ContainCheckShiftRotate: determine whether the sources of a shift/rotate node should be contained. 
-// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckShiftRotate(GenTreeOp* node) -{ - assert(node->OperIsShiftOrRotate()); -#ifdef TARGET_X86 - GenTree* source = node->gtOp1; - if (node->OperIsShiftLong()) - { - assert(source->OperGet() == GT_LONG); - MakeSrcContained(node, source); - } -#endif // !TARGET_X86 - - GenTree* shiftBy = node->gtOp2; - if (IsContainableImmed(node, shiftBy) && (shiftBy->AsIntConCommon()->IconValue() <= 255) && - (shiftBy->AsIntConCommon()->IconValue() >= 0)) - { - MakeSrcContained(node, shiftBy); - } -} - -//------------------------------------------------------------------------ -// ContainCheckStoreLoc: determine whether the source of a STORE_LCL* should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckStoreLoc(GenTreeLclVarCommon* storeLoc) const -{ - assert(storeLoc->OperIsLocalStore()); - GenTree* op1 = storeLoc->gtGetOp1(); - - if (op1->OperIs(GT_BITCAST)) - { - // If we know that the source of the bitcast will be in a register, then we can make - // the bitcast itself contained. This will allow us to store directly from the other - // type if this node doesn't get a register. - GenTree* bitCastSrc = op1->gtGetOp1(); - if (!bitCastSrc->isContained() && !bitCastSrc->IsRegOptional()) - { - op1->SetContained(); - return; - } - } - - const LclVarDsc* varDsc = comp->lvaGetDesc(storeLoc); - -#ifdef FEATURE_SIMD - if (varTypeIsSIMD(storeLoc)) - { - assert(!op1->IsCnsIntOrI()); - if (storeLoc->TypeIs(TYP_SIMD12) && op1->IsSIMDZero() && varDsc->lvDoNotEnregister) - { - // For a SIMD12 store we can zero from integer registers more easily. - MakeSrcContained(storeLoc, op1); - GenTree* constNode = op1->gtGetOp1(); - assert(constNode->OperIsConst()); - constNode->ClearContained(); - constNode->gtType = TYP_INT; - constNode->SetOper(GT_CNS_INT); - } - return; - } -#endif // FEATURE_SIMD - - // If the source is a containable immediate, make it contained, unless it is - // an int-size or larger store of zero to memory, because we can generate smaller code - // by zeroing a register and then storing it. - var_types type = varDsc->GetRegisterType(storeLoc); - if (IsContainableImmed(storeLoc, op1) && (!op1->IsIntegralConst(0) || varTypeIsSmall(type))) - { - MakeSrcContained(storeLoc, op1); - } -#ifdef TARGET_X86 - else if (op1->OperGet() == GT_LONG) - { - MakeSrcContained(storeLoc, op1); - } -#endif // TARGET_X86 -} - -//------------------------------------------------------------------------ -// ContainCheckCast: determine whether the source of a CAST node should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckCast(GenTreeCast* node) -{ - GenTree* castOp = node->CastOp(); - var_types castToType = node->CastToType(); - var_types srcType = castOp->TypeGet(); - - // force the srcType to unsigned if GT_UNSIGNED flag is set - if (node->gtFlags & GTF_UNSIGNED) - { - srcType = genUnsignedType(srcType); - } - - if (!node->gtOverflow() && (varTypeIsFloating(castToType) || varTypeIsFloating(srcType))) - { -#ifdef DEBUG - // If converting to float/double, the operand must be 4 or 8 byte in size. - if (varTypeIsFloating(castToType)) - { - unsigned opSize = genTypeSize(srcType); - assert(opSize == 4 || opSize == 8); - } -#endif // DEBUG - - // U8 -> R8 conversion requires that the operand be in a register. 
- if (srcType != TYP_ULONG) - { - if (IsContainableMemoryOp(castOp) || castOp->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(node, castOp); - } - else - { - // Mark castOp as reg optional to indicate codegen - // can still generate code if it is on stack. - castOp->SetRegOptional(); - } - } - } -#if !defined(TARGET_64BIT) - if (varTypeIsLong(srcType)) - { - noway_assert(castOp->OperGet() == GT_LONG); - castOp->SetContained(); - } -#endif // !defined(TARGET_64BIT) -} - -//------------------------------------------------------------------------ -// ContainCheckCompare: determine whether the sources of a compare node should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckCompare(GenTreeOp* cmp) -{ - assert(cmp->OperIsCompare() || cmp->OperIs(GT_CMP)); - - GenTree* op1 = cmp->AsOp()->gtOp1; - GenTree* op2 = cmp->AsOp()->gtOp2; - var_types op1Type = op1->TypeGet(); - var_types op2Type = op2->TypeGet(); - - // If either of op1 or op2 is floating point values, then we need to use - // ucomiss or ucomisd to compare, both of which support the following form: - // ucomis[s|d] xmm, xmm/mem - // That is only the second operand can be a memory op. - // - // Second operand is a memory Op: Note that depending on comparison operator, - // the operands of ucomis[s|d] need to be reversed. Therefore, either op1 or - // op2 can be a memory op depending on the comparison operator. - if (varTypeIsFloating(op1Type)) - { - // The type of the operands has to be the same and no implicit conversions at this stage. - assert(op1Type == op2Type); - - GenTree* otherOp; - if (GenCondition::FromFloatRelop(cmp).PreferSwap()) - { - otherOp = op1; - } - else - { - otherOp = op2; - } - - assert(otherOp != nullptr); - bool isSafeToContainOtherOp = true; - if (otherOp->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(cmp, otherOp); - } - else if (IsContainableMemoryOp(otherOp)) - { - isSafeToContainOtherOp = IsSafeToContainMem(cmp, otherOp); - if (isSafeToContainOtherOp) - { - MakeSrcContained(cmp, otherOp); - } - } - - if (!otherOp->isContained() && isSafeToContainOtherOp && IsSafeToContainMem(cmp, otherOp)) - { - // SSE2 allows only otherOp to be a memory-op. Since otherOp is not - // contained, we can mark it reg-optional. - // IsSafeToContainMem is expensive so we call it at most once for otherOp. - // If we already called IsSafeToContainMem, it must have returned false; - // otherwise, otherOp would be contained. - otherOp->SetRegOptional(); - } - - return; - } - - // TODO-XArch-CQ: factor out cmp optimization in 'genCondSetFlags' to be used here - // or in other backend. - - if (CheckImmedAndMakeContained(cmp, op2)) - { - // If the types are the same, or if the constant is of the correct size, - // we can treat the MemoryOp as contained. - if (op1Type == op2Type) - { - if (IsContainableMemoryOp(op1)) - { - MakeSrcContained(cmp, op1); - } - else - { - op1->SetRegOptional(); - } - } - } - else if (op1Type == op2Type) - { - // Note that TEST does not have a r,rm encoding like CMP has but we can still - // contain the second operand because the emitter maps both r,rm and rm,r to - // the same instruction code. This avoids the need to special case TEST here. 
- - bool isSafeToContainOp1 = true; - bool isSafeToContainOp2 = true; - - if (IsContainableMemoryOp(op2)) - { - isSafeToContainOp2 = IsSafeToContainMem(cmp, op2); - if (isSafeToContainOp2) - { - MakeSrcContained(cmp, op2); - } - } - - if (!op2->isContained() && IsContainableMemoryOp(op1)) - { - isSafeToContainOp1 = IsSafeToContainMem(cmp, op1); - if (isSafeToContainOp1) - { - MakeSrcContained(cmp, op1); - } - } - - if (!op1->isContained() && !op2->isContained()) - { - // One of op1 or op2 could be marked as reg optional - // to indicate that codegen can still generate code - // if one of them is on stack. - GenTree* regOptionalCandidate = op1->IsCnsIntOrI() ? op2 : PreferredRegOptionalOperand(cmp); - - // IsSafeToContainMem is expensive so we call it at most once for each operand - // in this method. If we already called IsSafeToContainMem, it must have returned false; - // otherwise, the corresponding operand (op1 or op2) would be contained. - bool setRegOptional = (regOptionalCandidate == op1) ? isSafeToContainOp1 && IsSafeToContainMem(cmp, op1) - : isSafeToContainOp2 && IsSafeToContainMem(cmp, op2); - if (setRegOptional) - { - regOptionalCandidate->SetRegOptional(); - } - } - } -} - -//------------------------------------------------------------------------ -// LowerRMWMemOp: Determine if this is a valid RMW mem op, and if so lower it accordingly -// -// Arguments: -// node - The indirect store node (GT_STORE_IND) of interest -// -// Return Value: -// Returns true if 'node' is a valid RMW mem op; false otherwise. -// -bool Lowering::LowerRMWMemOp(GenTreeIndir* storeInd) -{ - assert(false); - return false; -} - -//------------------------------------------------------------------------ -// ContainCheckBinary: Determine whether a binary op's operands should be contained. -// -// Arguments: -// node - the node we care about -// -void Lowering::ContainCheckBinary(GenTreeOp* node) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// ContainCheckBoundsChk: determine whether any source of a bounds check node should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckBoundsChk(GenTreeBoundsChk* node) -{ - assert(node->OperIsBoundsCheck()); - GenTree* other; - if (CheckImmedAndMakeContained(node, node->gtIndex)) - { - other = node->gtArrLen; - } - else if (CheckImmedAndMakeContained(node, node->gtArrLen)) - { - other = node->gtIndex; - } - else if (IsContainableMemoryOp(node->gtIndex)) - { - other = node->gtIndex; - } - else - { - other = node->gtArrLen; - } - - if (node->gtIndex->TypeGet() == node->gtArrLen->TypeGet()) - { - if (IsContainableMemoryOp(other)) - { - MakeSrcContained(node, other); - } - else - { - // We can mark 'other' as reg optional, since it is not contained. - other->SetRegOptional(); - } - } -} - -//------------------------------------------------------------------------ -// ContainCheckIntrinsic: determine whether the source of an INTRINSIC node should be contained. 
-// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckIntrinsic(GenTreeOp* node) -{ - assert(node->OperIs(GT_INTRINSIC)); - - NamedIntrinsic intrinsicName = node->AsIntrinsic()->gtIntrinsicName; - - if (intrinsicName == NI_System_Math_Sqrt || intrinsicName == NI_System_Math_Round || - intrinsicName == NI_System_Math_Ceiling || intrinsicName == NI_System_Math_Floor) - { - GenTree* op1 = node->gtGetOp1(); - if (IsContainableMemoryOp(op1) || op1->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(node, op1); - } - else - { - // Mark the operand as reg optional since codegen can still - // generate code if op1 is on stack. - op1->SetRegOptional(); - } - } -} - -#ifdef FEATURE_SIMD -//---------------------------------------------------------------------------------------------- -// ContainCheckSIMD: Perform containment analysis for a SIMD intrinsic node. -// -// Arguments: -// simdNode - The SIMD intrinsic node. -// -void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) -{ - switch (simdNode->gtSIMDIntrinsicID) - { - GenTree* op1; - GenTree* op2; - - case SIMDIntrinsicInit: - { - op1 = simdNode->AsOp()->gtOp1; -#ifndef TARGET_64BIT - if (op1->OperGet() == GT_LONG) - { - MakeSrcContained(simdNode, op1); - GenTree* op1lo = op1->gtGetOp1(); - GenTree* op1hi = op1->gtGetOp2(); - - if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || - (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))) - { - MakeSrcContained(op1, op1lo); - MakeSrcContained(op1, op1hi); - } - } - else -#endif // !TARGET_64BIT - if (op1->IsFPZero() || op1->IsIntegralConst(0) || - (varTypeIsIntegral(simdNode->gtSIMDBaseType) && op1->IsIntegralConst(-1))) - { - MakeSrcContained(simdNode, op1); - } - else if ((comp->getSIMDSupportLevel() == SIMD_AVX2_Supported) && - ((simdNode->gtSIMDSize == 16) || (simdNode->gtSIMDSize == 32))) - { - // Either op1 is a float or dbl constant or an addr - if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr()) - { - MakeSrcContained(simdNode, op1); - } - } - } - break; - - case SIMDIntrinsicInitArray: - // We have an array and an index, which may be contained. - CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2()); - break; - - case SIMDIntrinsicGetItem: - { - // This implements get_Item method. The sources are: - // - the source SIMD struct - // - index (which element to get) - // The result is baseType of SIMD struct. - op1 = simdNode->AsOp()->gtOp1; - op2 = simdNode->AsOp()->gtOp2; - - if (op1->OperGet() == GT_IND) - { - assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0); - op1->AsIndir()->Addr()->ClearContained(); - } - // If the index is a constant, mark it as contained. - CheckImmedAndMakeContained(simdNode, op2); - - if (IsContainableMemoryOp(op1)) - { - MakeSrcContained(simdNode, op1); - if (op1->OperGet() == GT_IND) - { - op1->AsIndir()->Addr()->ClearContained(); - } - } - } - break; - - case SIMDIntrinsicShuffleSSE2: - // Second operand is an integer constant and marked as contained. - assert(simdNode->AsOp()->gtOp2->IsCnsIntOrI()); - MakeSrcContained(simdNode, simdNode->AsOp()->gtOp2); - break; - - default: - break; - } -} -#endif // FEATURE_SIMD - -#ifdef FEATURE_HW_INTRINSICS -//---------------------------------------------------------------------------------------------- -// IsContainableHWIntrinsicOp: Return true if 'node' is a containable HWIntrinsic op. 
-// -// Arguments: -// containingNode - The hardware intrinsic node which contains 'node' -// node - The node to check -// [Out] supportsRegOptional - On return, this will be true if 'containingNode' supports regOptional operands; -// otherwise, false. -// -// Return Value: -// true if 'node' is a containable hardware intrinsic node; otherwise, false. -// -bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, GenTree* node, bool* supportsRegOptional) -{ - NamedIntrinsic containingIntrinsicId = containingNode->gtHWIntrinsicId; - HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(containingIntrinsicId); - - // We shouldn't have called in here if containingNode doesn't support containment - assert(HWIntrinsicInfo::SupportsContainment(containingIntrinsicId)); - - // containingNode supports nodes that read from an aligned memory address - // - // This will generally be an explicit LoadAligned instruction and is false for - // machines with VEX support when minOpts is enabled. This is because there is - // currently no way to guarantee that the address read from will always be - // aligned and we want to assert that the address is aligned when optimizations - // aren't enabled. However, when optimizations are enabled, we want to allow - // folding of memory operands as it produces better codegen and allows simpler - // coding patterns on the managed side. - bool supportsAlignedSIMDLoads = false; - - // containingNode supports nodes that read from general memory - // - // We currently have to assume all "general" loads are unaligned. As such, this is - // generally used to determine if we can mark the node as `regOptional` in the case - // where `node` is not containable. However, this can also be used to determine whether - // we can mark other types of reads as contained (such as when directly reading a local). - bool supportsGeneralLoads = false; - - // containingNode supports nodes that read from a scalar memory address - // - // This will generally be an explicit LoadScalar instruction but is also used to determine - // whether we can read an address of type T (we don't support this when the load would - // read more than sizeof(T) bytes). - bool supportsSIMDScalarLoads = false; - - // containingNode supports nodes that read from an unaligned memory address - // - // This will generally be an explicit Load instruction and is generally false for machines - // without VEX support. This is because older hardware required that the SIMD operand always - // be aligned to the 'natural alignment' of the type. - bool supportsUnalignedSIMDLoads = false; - - switch (category) - { - case HW_Category_MemoryLoad: - supportsGeneralLoads = (!node->OperIsHWIntrinsic()); - break; - - case HW_Category_SimpleSIMD: - { - switch (containingIntrinsicId) - { - case NI_SSE41_ConvertToVector128Int16: - case NI_SSE41_ConvertToVector128Int32: - case NI_SSE41_ConvertToVector128Int64: - case NI_AVX2_ConvertToVector256Int16: - case NI_AVX2_ConvertToVector256Int32: - case NI_AVX2_ConvertToVector256Int64: - { - supportsGeneralLoads = (!node->OperIsHWIntrinsic()); - break; - } - - default: - { - // These intrinsics only expect 16 or 32-byte nodes for containment - assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); - - if (!comp->canUseVexEncoding()) - { - // Most instructions under the non-VEX encoding require aligned operands. 
- // Those used for Sse2.ConvertToVector128Double (CVTDQ2PD and CVTPS2PD) - // are exceptions and don't fail for unaligned inputs. - - supportsAlignedSIMDLoads = (containingIntrinsicId != NI_SSE2_ConvertToVector128Double); - supportsUnalignedSIMDLoads = !supportsAlignedSIMDLoads; - } - else - { - supportsAlignedSIMDLoads = !comp->opts.MinOpts(); - supportsUnalignedSIMDLoads = true; - } - - supportsGeneralLoads = supportsUnalignedSIMDLoads; - break; - } - } - - assert(supportsSIMDScalarLoads == false); - break; - } - - case HW_Category_IMM: - { - switch (containingIntrinsicId) - { - case NI_SSE_Shuffle: - case NI_SSE2_ShiftLeftLogical: - case NI_SSE2_ShiftRightArithmetic: - case NI_SSE2_ShiftRightLogical: - case NI_SSE2_Shuffle: - case NI_SSE2_ShuffleHigh: - case NI_SSE2_ShuffleLow: - case NI_SSSE3_AlignRight: - case NI_SSE41_Blend: - case NI_SSE41_DotProduct: - case NI_SSE41_MultipleSumAbsoluteDifferences: - case NI_AES_KeygenAssist: - case NI_PCLMULQDQ_CarrylessMultiply: - case NI_AVX_Blend: - case NI_AVX_Compare: - case NI_AVX_DotProduct: - case NI_AVX_InsertVector128: - case NI_AVX_Permute: - case NI_AVX_Permute2x128: - case NI_AVX2_Blend: - case NI_AVX2_InsertVector128: - case NI_AVX2_MultipleSumAbsoluteDifferences: - case NI_AVX2_Permute2x128: - case NI_AVX2_Permute4x64: - case NI_AVX2_ShiftLeftLogical: - case NI_AVX2_ShiftRightArithmetic: - case NI_AVX2_ShiftRightLogical: - case NI_AVX2_ShuffleHigh: - case NI_AVX2_ShuffleLow: - { - // These intrinsics only expect 16 or 32-byte nodes for containment - assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); - assert(supportsSIMDScalarLoads == false); - - supportsAlignedSIMDLoads = !comp->canUseVexEncoding() || !comp->opts.MinOpts(); - supportsUnalignedSIMDLoads = comp->canUseVexEncoding(); - supportsGeneralLoads = supportsUnalignedSIMDLoads; - - break; - } - - case NI_SSE2_Insert: - case NI_SSE41_Insert: - case NI_SSE41_X64_Insert: - { - if (containingNode->gtSIMDBaseType == TYP_FLOAT) - { - assert(containingIntrinsicId == NI_SSE41_Insert); - assert(genTypeSize(node->TypeGet()) == 16); - - // Sse41.Insert(V128, V128, byte) is a bit special - // in that it has different behavior depending on whether the - // second operand is coming from a register or memory. When coming - // from a register, all 4 elements of the vector can be used and it - // is effectively a regular `SimpleSIMD` operation; but when loading - // from memory, it only works with the lowest element and is effectively - // a `SIMDScalar`. - - assert(supportsAlignedSIMDLoads == false); - assert(supportsUnalignedSIMDLoads == false); - assert(supportsGeneralLoads == false); - assert(supportsSIMDScalarLoads == false); - - GenTree* op1 = containingNode->gtGetOp1(); - GenTree* op2 = nullptr; - GenTree* op3 = nullptr; - - assert(op1->OperIsList()); - assert(containingNode->gtGetOp2() == nullptr); - - GenTreeArgList* argList = op1->AsArgList(); - - op1 = argList->Current(); - argList = argList->Rest(); - - op2 = argList->Current(); - argList = argList->Rest(); - - assert(node == op2); - - op3 = argList->Current(); - - // The upper two bits of the immediate value are ignored if - // op2 comes from memory. In order to support using the upper - // bits, we need to disable containment support if op3 is not - // constant or if the constant is greater than 0x3F (which means - // at least one of the upper two bits is set). 
- - if (op3->IsCnsIntOrI()) - { - ssize_t ival = op3->AsIntCon()->IconValue(); - assert((ival >= 0) && (ival <= 255)); - - supportsSIMDScalarLoads = (ival <= 0x3F); - supportsGeneralLoads = supportsSIMDScalarLoads; - } - break; - } - - // We should only get here for integral nodes. - assert(varTypeIsIntegral(node->TypeGet())); - - assert(supportsAlignedSIMDLoads == false); - assert(supportsUnalignedSIMDLoads == false); - assert(supportsSIMDScalarLoads == false); - - const unsigned expectedSize = genTypeSize(containingNode->gtSIMDBaseType); - const unsigned operandSize = genTypeSize(node->TypeGet()); - - supportsGeneralLoads = (operandSize >= expectedSize); - break; - } - - case NI_AVX_CompareScalar: - { - // These intrinsics only expect 16 or 32-byte nodes for containment - assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); - - assert(supportsAlignedSIMDLoads == false); - assert(supportsUnalignedSIMDLoads == false); - - supportsSIMDScalarLoads = true; - supportsGeneralLoads = supportsSIMDScalarLoads; - break; - } - - default: - { - assert(supportsAlignedSIMDLoads == false); - assert(supportsGeneralLoads == false); - assert(supportsSIMDScalarLoads == false); - assert(supportsUnalignedSIMDLoads == false); - break; - } - } - break; - } - - case HW_Category_SIMDScalar: - { - assert(supportsAlignedSIMDLoads == false); - assert(supportsUnalignedSIMDLoads == false); - - switch (containingIntrinsicId) - { - case NI_Vector128_CreateScalarUnsafe: - case NI_Vector256_CreateScalarUnsafe: - { - assert(supportsSIMDScalarLoads == false); - - const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType)); - const unsigned operandSize = genTypeSize(node->TypeGet()); - - supportsGeneralLoads = (operandSize == expectedSize); - break; - } - - case NI_AVX2_BroadcastScalarToVector128: - case NI_AVX2_BroadcastScalarToVector256: - { - // The memory form of this already takes a pointer, and cannot be further contained. - // The containable form is the one that takes a SIMD value, that may be in memory. - supportsGeneralLoads = (node->TypeGet() == TYP_SIMD16); - break; - } - - case NI_SSE_ConvertScalarToVector128Single: - case NI_SSE2_ConvertScalarToVector128Double: - case NI_SSE2_ConvertScalarToVector128Int32: - case NI_SSE2_ConvertScalarToVector128UInt32: - case NI_SSE_X64_ConvertScalarToVector128Single: - case NI_SSE2_X64_ConvertScalarToVector128Double: - case NI_SSE2_X64_ConvertScalarToVector128Int64: - case NI_SSE2_X64_ConvertScalarToVector128UInt64: - { - if (!varTypeIsIntegral(node->TypeGet())) - { - // The floating-point overload doesn't require any special semantics - assert(containingIntrinsicId == NI_SSE2_ConvertScalarToVector128Double); - supportsSIMDScalarLoads = true; - supportsGeneralLoads = supportsSIMDScalarLoads; - break; - } - - assert(supportsSIMDScalarLoads == false); - - const unsigned expectedSize = genTypeSize(genActualType(containingNode->gtSIMDBaseType)); - const unsigned operandSize = genTypeSize(node->TypeGet()); - - supportsGeneralLoads = (operandSize == expectedSize); - break; - } - - default: - { - // These intrinsics only expect 16 or 32-byte nodes for containment - assert((genTypeSize(node->TypeGet()) == 16) || (genTypeSize(node->TypeGet()) == 32)); - - supportsSIMDScalarLoads = true; - supportsGeneralLoads = supportsSIMDScalarLoads; - break; - } - } - break; - } - - case HW_Category_Scalar: - { - // We should only get here for integral nodes. 
- assert(varTypeIsIntegral(node->TypeGet())); - - assert(supportsAlignedSIMDLoads == false); - assert(supportsUnalignedSIMDLoads == false); - assert(supportsSIMDScalarLoads == false); - - unsigned expectedSize = genTypeSize(containingNode->TypeGet()); - const unsigned operandSize = genTypeSize(node->TypeGet()); - - // CRC32 codegen depends on its second oprand's type. - // Currently, we are using SIMDBaseType to store the op2Type info. - if (containingIntrinsicId == NI_SSE42_Crc32) - { - var_types op2Type = containingNode->gtSIMDBaseType; - expectedSize = genTypeSize(op2Type); - } - - supportsGeneralLoads = (operandSize >= expectedSize); - break; - } - - default: - { - assert(supportsAlignedSIMDLoads == false); - assert(supportsGeneralLoads == false); - assert(supportsSIMDScalarLoads == false); - assert(supportsUnalignedSIMDLoads == false); - break; - } - } - - noway_assert(supportsRegOptional != nullptr); - *supportsRegOptional = supportsGeneralLoads; - - if (!node->OperIsHWIntrinsic()) - { - return supportsGeneralLoads && IsContainableMemoryOp(node); - } - - // TODO-XArch: Update this to be table driven, if possible. - - NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->gtHWIntrinsicId; - - switch (intrinsicId) - { - case NI_SSE_LoadAlignedVector128: - case NI_SSE2_LoadAlignedVector128: - case NI_AVX_LoadAlignedVector256: - { - return supportsAlignedSIMDLoads; - } - - case NI_SSE_LoadScalarVector128: - case NI_SSE2_LoadScalarVector128: - { - return supportsSIMDScalarLoads; - } - - case NI_SSE_LoadVector128: - case NI_SSE2_LoadVector128: - case NI_AVX_LoadVector256: - { - return supportsUnalignedSIMDLoads; - } - - case NI_AVX_ExtractVector128: - case NI_AVX2_ExtractVector128: - { - return false; - } - - default: - { - assert(!node->isContainableHWIntrinsic()); - return false; - } - } -} - -//---------------------------------------------------------------------------------------------- -// ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware -// intrinsic node. -// -// Arguments: -// node - The hardware intrinsic node -// addr - The address node to try contain -// -void Lowering::ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr) -{ - assert((addr->TypeGet() == TYP_I_IMPL) || (addr->TypeGet() == TYP_BYREF)); - TryCreateAddrMode(addr, true); - if ((addr->OperIs(GT_CLS_VAR_ADDR, GT_LCL_VAR_ADDR, GT_LCL_FLD_ADDR, GT_LEA) || - (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp))) && - IsSafeToContainMem(node, addr)) - { - MakeSrcContained(node, addr); - } -} - -//---------------------------------------------------------------------------------------------- -// ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node. -// -// Arguments: -// node - The hardware intrinsic node. 
-// -void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) -{ - NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; - HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); - int numArgs = HWIntrinsicInfo::lookupNumArgs(node); - var_types baseType = node->gtSIMDBaseType; - unsigned simdSize = node->gtSIMDSize; - - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - GenTree* op3 = nullptr; - - if (!HWIntrinsicInfo::SupportsContainment(intrinsicId)) - { - // AVX2 gather are not containable and always have constant IMM argument - if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId)) - { - GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); - assert(lastOp != nullptr); - MakeSrcContained(node, lastOp); - } - // Exit early if containment isn't supported - return; - } - - if (HWIntrinsicInfo::lookupCategory(intrinsicId) == HW_Category_IMM) - { - GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); - assert(lastOp != nullptr); - - if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI()) - { - MakeSrcContained(node, lastOp); - } - } - - if ((node->gtSIMDSize == 8) || (node->gtSIMDSize == 12)) - { - // TODO-XArch-CQ: Ideally we would key this off of the size containingNode - // expects vs the size node actually is or would be if spilled to the stack - return; - } - - // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - - const bool isCommutative = HWIntrinsicInfo::IsCommutative(intrinsicId); - - if (numArgs == 1) - { - // One argument intrinsics cannot be commutative - assert(!isCommutative); - - assert(!op1->OperIsList()); - assert(op2 == nullptr); - - switch (category) - { - case HW_Category_MemoryLoad: - ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); - break; - - case HW_Category_SimpleSIMD: - case HW_Category_SIMDScalar: - case HW_Category_Scalar: - { - switch (intrinsicId) - { - case NI_SSE_ReciprocalScalar: - case NI_SSE_ReciprocalSqrtScalar: - case NI_SSE_SqrtScalar: - case NI_SSE2_SqrtScalar: - case NI_SSE41_CeilingScalar: - case NI_SSE41_FloorScalar: - case NI_SSE41_RoundCurrentDirectionScalar: - case NI_SSE41_RoundToNearestIntegerScalar: - case NI_SSE41_RoundToNegativeInfinityScalar: - case NI_SSE41_RoundToPositiveInfinityScalar: - case NI_SSE41_RoundToZeroScalar: - { - // These intrinsics have both 1 and 2-operand overloads. - // - // The 1-operand overload basically does `intrinsic(op1, op1)` - // - // Because of this, the operand must be loaded into a register - // and cannot be contained. - return; - } - - case NI_SSE2_ConvertToInt32: - case NI_SSE2_X64_ConvertToInt64: - case NI_SSE2_ConvertToUInt32: - case NI_SSE2_X64_ConvertToUInt64: - case NI_AVX2_ConvertToInt32: - case NI_AVX2_ConvertToUInt32: - { - if (varTypeIsIntegral(baseType)) - { - // TODO-XARCH-CQ: These intrinsics are "ins reg/mem, xmm" and don't - // currently support containment. 
- return; - } - - break; - } - - case NI_SSE41_ConvertToVector128Int16: - case NI_SSE41_ConvertToVector128Int32: - case NI_SSE41_ConvertToVector128Int64: - case NI_AVX2_ConvertToVector256Int16: - case NI_AVX2_ConvertToVector256Int32: - case NI_AVX2_ConvertToVector256Int64: - if (!varTypeIsSIMD(op1->gtType)) - { - ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); - return; - } - break; - - default: - { - break; - } - } - - bool supportsRegOptional = false; - - if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - MakeSrcContained(node, op1); - } - else if (supportsRegOptional) - { - op1->SetRegOptional(); - } - break; - } - - default: - { - unreached(); - break; - } - } - } - else - { - if (numArgs == 2) - { - assert(!op1->OperIsList()); - assert(op2 != nullptr); - assert(!op2->OperIsList()); - - switch (category) - { - case HW_Category_MemoryLoad: - if ((intrinsicId == NI_AVX_MaskLoad) || (intrinsicId == NI_AVX2_MaskLoad)) - { - ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); - } - else - { - ContainCheckHWIntrinsicAddr(node, node->gtGetOp2()); - } - break; - - case HW_Category_MemoryStore: - ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()); - - if (((intrinsicId == NI_SSE_Store) || (intrinsicId == NI_SSE2_Store)) && op2->OperIsHWIntrinsic() && - ((op2->AsHWIntrinsic()->gtHWIntrinsicId == NI_AVX_ExtractVector128) || - (op2->AsHWIntrinsic()->gtHWIntrinsicId == NI_AVX2_ExtractVector128)) && - op2->gtGetOp2()->IsIntegralConst()) - { - MakeSrcContained(node, op2); - } - break; - - case HW_Category_SimpleSIMD: - case HW_Category_SIMDScalar: - case HW_Category_Scalar: - { - bool supportsRegOptional = false; - - if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - MakeSrcContained(node, op2); - } - else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || - (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && - IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - MakeSrcContained(node, op1); - - // Swap the operands here to make the containment checks in codegen significantly simpler - node->gtOp1 = op2; - node->gtOp2 = op1; - } - else if (supportsRegOptional) - { - op2->SetRegOptional(); - - // TODO-XArch-CQ: For commutative nodes, either operand can be reg-optional. - // https://github.com/dotnet/runtime/issues/6358 - } - break; - } - - case HW_Category_IMM: - { - // We don't currently have any IMM intrinsics which are also commutative - assert(!isCommutative); - bool supportsRegOptional = false; - - switch (intrinsicId) - { - case NI_SSE2_Extract: - case NI_SSE41_Extract: - case NI_SSE41_X64_Extract: - case NI_AVX_ExtractVector128: - case NI_AVX2_ExtractVector128: - { - // TODO-XARCH-CQ: These intrinsics are "ins reg/mem, xmm, imm8" and don't - // currently support containment. 
- break; - } - - case NI_SSE2_ShiftLeftLogical: - case NI_SSE2_ShiftRightArithmetic: - case NI_SSE2_ShiftRightLogical: - case NI_AVX2_ShiftLeftLogical: - case NI_AVX2_ShiftRightArithmetic: - case NI_AVX2_ShiftRightLogical: - { - // These intrinsics can have op2 be imm or reg/mem - - if (!HWIntrinsicInfo::isImmOp(intrinsicId, op2)) - { - if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - MakeSrcContained(node, op2); - } - else if (supportsRegOptional) - { - op2->SetRegOptional(); - } - } - break; - } - - case NI_SSE2_Shuffle: - case NI_SSE2_ShuffleHigh: - case NI_SSE2_ShuffleLow: - case NI_AVX2_Permute4x64: - case NI_AVX2_Shuffle: - case NI_AVX2_ShuffleHigh: - case NI_AVX2_ShuffleLow: - { - // These intrinsics have op2 as an imm and op1 as a reg/mem - - if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - MakeSrcContained(node, op1); - } - else if (supportsRegOptional) - { - op1->SetRegOptional(); - } - break; - } - - case NI_AVX_Permute: - { - // These intrinsics can have op2 be imm or reg/mem - // They also can have op1 be reg/mem and op2 be imm - - if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) - { - if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - MakeSrcContained(node, op1); - } - else if (supportsRegOptional) - { - op1->SetRegOptional(); - } - } - else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - MakeSrcContained(node, op2); - } - else if (supportsRegOptional) - { - op2->SetRegOptional(); - } - break; - } - - case NI_AES_KeygenAssist: - { - if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - MakeSrcContained(node, op1); - } - else if (supportsRegOptional) - { - op1->SetRegOptional(); - } - break; - } - - case NI_SSE2_ShiftLeftLogical128BitLane: - case NI_SSE2_ShiftRightLogical128BitLane: - case NI_AVX2_ShiftLeftLogical128BitLane: - case NI_AVX2_ShiftRightLogical128BitLane: - { -#if DEBUG - // These intrinsics should have been marked contained by the general-purpose handling - // earlier in the method. 
- - GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node); - assert(lastOp != nullptr); - - if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI()) - { - assert(lastOp->isContained()); - } -#endif - - break; - } - - default: - { - assert(!"Unhandled containment for binary hardware intrinsic with immediate operand"); - break; - } - } - - break; - } - - default: - { - unreached(); - break; - } - } - } - else if (numArgs == 3) - { - // three argument intrinsics should not be marked commutative - assert(!isCommutative); - - assert(op1->OperIsList()); - assert(op2 == nullptr); - - GenTreeArgList* argList = op1->AsArgList(); - GenTreeArgList* originalArgList = argList; - - op1 = argList->Current(); - argList = argList->Rest(); - - op2 = argList->Current(); - argList = argList->Rest(); - - op3 = argList->Current(); - assert(argList->Rest() == nullptr); - - switch (category) - { - case HW_Category_MemoryStore: - ContainCheckHWIntrinsicAddr(node, node->gtGetOp1()->AsOp()->gtGetOp1()); - break; - - case HW_Category_SimpleSIMD: - case HW_Category_SIMDScalar: - case HW_Category_Scalar: - { - if ((intrinsicId >= NI_FMA_MultiplyAdd) && (intrinsicId <= NI_FMA_MultiplySubtractNegatedScalar)) - { - bool supportsRegOptional = false; - - if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional)) - { - // 213 form: op1 = (op2 * op1) + [op3] - MakeSrcContained(node, op3); - } - else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - // 132 form: op1 = (op1 * op3) + [op2] - MakeSrcContained(node, op2); - } - else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - // Intrinsics with CopyUpperBits semantics cannot have op1 be contained - - if (!HWIntrinsicInfo::CopiesUpperBits(intrinsicId)) - { - // 231 form: op3 = (op2 * op3) + [op1] - MakeSrcContained(node, op1); - } - } - else - { - assert(supportsRegOptional); - - // TODO-XArch-CQ: Technically any one of the three operands can - // be reg-optional. With a limitation on op1 where - // it can only be so if CopyUpperBits is off. 
- // https://github.com/dotnet/runtime/issues/6358 - - // 213 form: op1 = (op2 * op1) + op3 - op3->SetRegOptional(); - } - } - else - { - bool supportsRegOptional = false; - - switch (intrinsicId) - { - case NI_SSE41_BlendVariable: - case NI_AVX_BlendVariable: - case NI_AVX2_BlendVariable: - { - if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - MakeSrcContained(node, op2); - } - else if (supportsRegOptional) - { - op2->SetRegOptional(); - } - break; - } - - case NI_BMI2_MultiplyNoFlags: - case NI_BMI2_X64_MultiplyNoFlags: - { - if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - MakeSrcContained(node, op2); - } - else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional)) - { - MakeSrcContained(node, op1); - // MultiplyNoFlags is a Commutative operation, so swap the first two operands here - // to make the containment checks in codegen significantly simpler - *(originalArgList->pCurrent()) = op2; - *(originalArgList->Rest()->pCurrent()) = op1; - } - else if (supportsRegOptional) - { - op2->SetRegOptional(); - } - break; - } - - default: - { - unreached(); - break; - } - } - } - break; - } - - case HW_Category_IMM: - { - bool supportsRegOptional = false; - - switch (intrinsicId) - { - case NI_SSE_Shuffle: - case NI_SSE2_Insert: - case NI_SSE2_Shuffle: - case NI_SSSE3_AlignRight: - case NI_SSE41_Blend: - case NI_SSE41_DotProduct: - case NI_SSE41_Insert: - case NI_SSE41_X64_Insert: - case NI_SSE41_MultipleSumAbsoluteDifferences: - case NI_AVX_Blend: - case NI_AVX_Compare: - case NI_AVX_CompareScalar: - case NI_AVX_DotProduct: - case NI_AVX_InsertVector128: - case NI_AVX_Permute2x128: - case NI_AVX_Shuffle: - case NI_AVX2_AlignRight: - case NI_AVX2_Blend: - case NI_AVX2_InsertVector128: - case NI_AVX2_MultipleSumAbsoluteDifferences: - case NI_AVX2_Permute2x128: - case NI_PCLMULQDQ_CarrylessMultiply: - { - if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) - { - MakeSrcContained(node, op2); - } - else if (supportsRegOptional) - { - op2->SetRegOptional(); - } - break; - } - - default: - { - assert(!"Unhandled containment for ternary hardware intrinsic with immediate operand"); - break; - } - } - - break; - } - - default: - { - unreached(); - break; - } - } - } - else - { - unreached(); - } - } -} -#endif // FEATURE_HW_INTRINSICS - -//------------------------------------------------------------------------ -// ContainCheckFloatBinary: determine whether the sources of a floating point binary node should be contained. -// -// Arguments: -// node - pointer to the node -// -void Lowering::ContainCheckFloatBinary(GenTreeOp* node) -{ - assert(node->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_DIV) && varTypeIsFloating(node)); - - // overflow operations aren't supported on float/double types. - assert(!node->gtOverflowEx()); - - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - // No implicit conversions at this stage as the expectation is that - // everything is made explicit by adding casts. 
- assert(op1->TypeGet() == op2->TypeGet()); - - bool isSafeToContainOp1 = true; - bool isSafeToContainOp2 = true; - - if (op2->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(node, op2); - } - else if (IsContainableMemoryOp(op2)) - { - isSafeToContainOp2 = IsSafeToContainMem(node, op2); - if (isSafeToContainOp2) - { - MakeSrcContained(node, op2); - } - } - - if (!op2->isContained() && node->OperIsCommutative()) - { - // Though we have GT_ADD(op1=memOp, op2=non-memOp, we try to reorder the operands - // as long as it is safe so that the following efficient code sequence is generated: - // addss/sd targetReg, memOp (if op1Reg == targetReg) OR - // movaps targetReg, op2Reg; addss/sd targetReg, [memOp] - // - // Instead of - // movss op1Reg, [memOp]; addss/sd targetReg, Op2Reg (if op1Reg == targetReg) OR - // movss op1Reg, [memOp]; movaps targetReg, op1Reg, addss/sd targetReg, Op2Reg - - if (op1->IsCnsNonZeroFltOrDbl()) - { - MakeSrcContained(node, op1); - } - else if (IsContainableMemoryOp(op1)) - { - isSafeToContainOp1 = IsSafeToContainMem(node, op1); - if (isSafeToContainOp1) - { - MakeSrcContained(node, op1); - } - } - } - - if (!op1->isContained() && !op2->isContained()) - { - // If there are no containable operands, we can make an operand reg optional. - // IsSafeToContainMem is expensive so we call it at most once for each operand - // in this method. If we already called IsSafeToContainMem, it must have returned false; - // otherwise, the corresponding operand (op1 or op2) would be contained. - isSafeToContainOp1 = isSafeToContainOp1 && IsSafeToContainMem(node, op1); - isSafeToContainOp2 = isSafeToContainOp2 && IsSafeToContainMem(node, op2); - SetRegOptionalForBinOp(node, isSafeToContainOp1, isSafeToContainOp2); - } -} - -#endif // defined (TARGET_WASM32) || defined(TARGET_WASM64) diff --git a/src/coreclr/jit/lsrawasm.cpp b/src/coreclr/jit/lsrawasm.cpp deleted file mode 100644 index 67f7f658aa3f..000000000000 --- a/src/coreclr/jit/lsrawasm.cpp +++ /dev/null @@ -1,1528 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XX XX -XX Register Requirements for AMD64 XX -XX XX -XX This encapsulates all the logic for setting register requirements for XX -XX the AMD64 architecture. XX -XX XX -XX XX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -*/ - -#include "jitpch.h" -#ifdef _MSC_VER -#pragma hdrstop -#endif - -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) - -#include "jit.h" -#include "sideeffects.h" -#include "lower.h" - -//------------------------------------------------------------------------ -// BuildNode: Build the RefPositions for for a node -// -// Arguments: -// treeNode - the node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -// Notes: -// Preconditions: -// LSRA Has been initialized. -// -// Postconditions: -// RefPositions have been built for all the register defs and uses required -// for this node. -// -int LinearScan::BuildNode(GenTree* tree) -{ - assert(false); - return 0; -} - -//------------------------------------------------------------------------ -// getTgtPrefOperands: Identify whether the operands of an Op should be preferenced to the target. 
-// -// Arguments: -// tree - the node of interest. -// prefOp1 - a bool "out" parameter indicating, on return, whether op1 should be preferenced to the target. -// prefOp2 - a bool "out" parameter indicating, on return, whether op2 should be preferenced to the target. -// -// Return Value: -// This has two "out" parameters for returning the results (see above). -// -// Notes: -// The caller is responsible for initializing the two "out" parameters to false. -// -void LinearScan::getTgtPrefOperands(GenTreeOp* tree, bool& prefOp1, bool& prefOp2) -{ - assert(false); -} - -//------------------------------------------------------------------------------ -// isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format -// -// Arguments: -// tree - a binary tree node -// -// Return Value: -// Returns true if we can use the read-modify-write instruction form -// -// Notes: -// This is used to determine whether to preference the source to the destination register. -// -bool LinearScan::isRMWRegOper(GenTree* tree) -{ - assert(false); - return false; -} - -// Support for building RefPositions for RMW nodes. -int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates) -{ - assert(false); - return 0; -} - -//------------------------------------------------------------------------ -// BuildShiftRotate: Set the NodeInfo for a shift or rotate. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildShiftRotate(GenTree* tree) -{ - assert(false); - return 0; -} - -//------------------------------------------------------------------------ -// BuildCall: Set the NodeInfo for a call. -// -// Arguments: -// call - The call node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildCall(GenTreeCall* call) -{ - bool hasMultiRegRetVal = false; - const ReturnTypeDesc* retTypeDesc = nullptr; - int srcCount = 0; - int dstCount = 0; - regMaskTP dstCandidates = RBM_NONE; - - assert(!call->isContained()); - if (call->TypeGet() != TYP_VOID) - { - hasMultiRegRetVal = call->HasMultiRegRetVal(); - if (hasMultiRegRetVal) - { - // dst count = number of registers in which the value is returned by call - retTypeDesc = call->GetReturnTypeDesc(); - dstCount = retTypeDesc->GetReturnRegCount(); - } - else - { - dstCount = 1; - } - } - - GenTree* ctrlExpr = call->gtControlExpr; - if (call->gtCallType == CT_INDIRECT) - { - ctrlExpr = call->gtCallAddr; - } - - RegisterType registerType = regType(call); - - // Set destination candidates for return value of the call. - CLANG_FORMAT_COMMENT_ANCHOR; - -#ifdef TARGET_X86 - if (call->IsHelperCall(compiler, CORINFO_HELP_INIT_PINVOKE_FRAME)) - { - // The x86 CORINFO_HELP_INIT_PINVOKE_FRAME helper uses a custom calling convention that returns with - // TCB in REG_PINVOKE_TCB. AMD64/ARM64 use the standard calling convention. fgMorphCall() sets the - // correct argument registers. - dstCandidates = RBM_PINVOKE_TCB; - } - else -#endif // TARGET_X86 - if (hasMultiRegRetVal) - { - assert(retTypeDesc != nullptr); - dstCandidates = retTypeDesc->GetABIReturnRegs(); - assert((int)genCountBits(dstCandidates) == dstCount); - } - else if (varTypeUsesFloatReg(registerType)) - { -#ifdef TARGET_X86 - // The return value will be on the X87 stack, and we will need to move it. 
- dstCandidates = allRegs(registerType); -#else // !TARGET_X86 - dstCandidates = RBM_FLOATRET; -#endif // !TARGET_X86 - } - else if (registerType == TYP_LONG) - { - dstCandidates = RBM_LNGRET; - } - else - { - dstCandidates = RBM_INTRET; - } - - // number of args to a call = - // callRegArgs + (callargs - placeholders, setup, etc) - // there is an explicit thisPtr but it is redundant - - bool callHasFloatRegArgs = false; - bool isVarArgs = call->IsVarargs(); - - // First, determine internal registers. - // We will need one for any float arguments to a varArgs call. - for (GenTreeCall::Use& use : call->LateArgs()) - { - GenTree* argNode = use.GetNode(); - if (argNode->OperIsPutArgReg()) - { - HandleFloatVarArgs(call, argNode, &callHasFloatRegArgs); - } - else if (argNode->OperGet() == GT_FIELD_LIST) - { - for (GenTreeFieldList::Use& use : argNode->AsFieldList()->Uses()) - { - assert(use.GetNode()->OperIsPutArgReg()); - HandleFloatVarArgs(call, use.GetNode(), &callHasFloatRegArgs); - } - } - } - - // Now, count reg args - for (GenTreeCall::Use& use : call->LateArgs()) - { - // By this point, lowering has ensured that all call arguments are one of the following: - // - an arg setup store - // - an arg placeholder - // - a nop - // - a copy blk - // - a field list - // - a put arg - // - // Note that this property is statically checked by LinearScan::CheckBlock. - GenTree* argNode = use.GetNode(); - - // Each register argument corresponds to one source. - if (argNode->OperIsPutArgReg()) - { - srcCount++; - BuildUse(argNode, genRegMask(argNode->GetRegNum())); - } -#ifdef UNIX_AMD64_ABI - else if (argNode->OperGet() == GT_FIELD_LIST) - { - for (GenTreeFieldList::Use& use : argNode->AsFieldList()->Uses()) - { - assert(use.GetNode()->OperIsPutArgReg()); - srcCount++; - BuildUse(use.GetNode(), genRegMask(use.GetNode()->GetRegNum())); - } - } -#endif // UNIX_AMD64_ABI - -#ifdef DEBUG - // In DEBUG only, check validity with respect to the arg table entry. - - fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, argNode); - assert(curArgTabEntry); - - if (curArgTabEntry->GetRegNum() == REG_STK) - { - // late arg that is not passed in a register - assert(argNode->gtOper == GT_PUTARG_STK); - -#ifdef FEATURE_PUT_STRUCT_ARG_STK - // If the node is TYP_STRUCT and it is put on stack with - // putarg_stk operation, we consume and produce no registers. - // In this case the embedded Obj node should not produce - // registers too since it is contained. - // Note that if it is a SIMD type the argument will be in a register. 
- if (argNode->TypeGet() == TYP_STRUCT) - { - assert(argNode->gtGetOp1() != nullptr && argNode->gtGetOp1()->OperGet() == GT_OBJ); - assert(argNode->gtGetOp1()->isContained()); - } -#endif // FEATURE_PUT_STRUCT_ARG_STK - continue; - } -#ifdef UNIX_AMD64_ABI - if (argNode->OperGet() == GT_FIELD_LIST) - { - assert(argNode->isContained()); - assert(varTypeIsStruct(argNode) || curArgTabEntry->isStruct); - - unsigned regIndex = 0; - for (GenTreeFieldList::Use& use : argNode->AsFieldList()->Uses()) - { - const regNumber argReg = curArgTabEntry->GetRegNum(regIndex); - assert(use.GetNode()->GetRegNum() == argReg); - regIndex++; - } - } - else -#endif // UNIX_AMD64_ABI - { - const regNumber argReg = curArgTabEntry->GetRegNum(); - assert(argNode->GetRegNum() == argReg); - } -#endif // DEBUG - } - -#ifdef DEBUG - // Now, count stack args - // Note that these need to be computed into a register, but then - // they're just stored to the stack - so the reg doesn't - // need to remain live until the call. In fact, it must not - // because the code generator doesn't actually consider it live, - // so it can't be spilled. - - for (GenTreeCall::Use& use : call->Args()) - { - GenTree* arg = use.GetNode(); - if (!(arg->gtFlags & GTF_LATE_ARG) && !arg) - { - if (arg->IsValue() && !arg->isContained()) - { - assert(arg->IsUnusedValue()); - } - } - } -#endif // DEBUG - - // set reg requirements on call target represented as control sequence. - if (ctrlExpr != nullptr) - { - regMaskTP ctrlExprCandidates = RBM_NONE; - - // In case of fast tail implemented as jmp, make sure that gtControlExpr is - // computed into a register. - if (call->IsFastTailCall()) - { - assert(!ctrlExpr->isContained()); - // Fast tail call - make sure that call target is always computed in RAX - // so that epilog sequence can generate "jmp rax" to achieve fast tail call. - ctrlExprCandidates = RBM_RAX; - } -#ifdef TARGET_X86 - else if (call->IsVirtualStub() && (call->gtCallType == CT_INDIRECT)) - { - // On x86, we need to generate a very specific pattern for indirect VSD calls: - // - // 3-byte nop - // call dword ptr [eax] - // - // Where EAX is also used as an argument to the stub dispatch helper. Make - // sure that the call target address is computed into EAX in this case. - assert(ctrlExpr->isIndir() && ctrlExpr->isContained()); - ctrlExprCandidates = RBM_VIRTUAL_STUB_TARGET; - } -#endif // TARGET_X86 - -#if FEATURE_VARARG - // If it is a fast tail call, it is already preferenced to use RAX. - // Therefore, no need set src candidates on call tgt again. - if (call->IsVarargs() && callHasFloatRegArgs && !call->IsFastTailCall()) - { - // Don't assign the call target to any of the argument registers because - // we will use them to also pass floating point arguments as required - // by Amd64 ABI. - ctrlExprCandidates = allRegs(TYP_INT) & ~(RBM_ARG_REGS); - } -#endif // !FEATURE_VARARG - srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates); - } - - buildInternalRegisterUses(); - - // Now generate defs and kills. - regMaskTP killMask = getKillSetForCall(call); - BuildDefsWithKills(call, dstCount, dstCandidates, killMask); - return srcCount; -} - -//------------------------------------------------------------------------ -// BuildBlockStore: Build the RefPositions for a block store node. -// -// Arguments: -// blkNode - The block store node of interest -// -// Return Value: -// The number of sources consumed by this node. 
-// -int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) -{ - assert(false); - return 0; -} - -#ifdef FEATURE_PUT_STRUCT_ARG_STK -//------------------------------------------------------------------------ -// BuildPutArgStk: Set the NodeInfo for a GT_PUTARG_STK. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) -{ - assert(false); - return 0; -} -#endif // FEATURE_PUT_STRUCT_ARG_STK - -//------------------------------------------------------------------------ -// BuildLclHeap: Set the NodeInfo for a GT_LCLHEAP. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildLclHeap(GenTree* tree) -{ - int srcCount = 1; - - // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp): - // Here '-' means don't care. - // - // Size? Init Memory? # temp regs - // 0 - 0 (returns 0) - // const and <=6 reg words - 0 (pushes '0') - // const and >6 reg words Yes 0 (pushes '0') - // const and =PageSize No 2 (regCnt and tmpReg for subtracing from sp) - // Non-const Yes 0 (regCnt=targetReg and pushes '0') - // Non-const No 2 (regCnt and tmpReg for subtracting from sp) - // - // Note: Here we don't need internal register to be different from targetReg. - // Rather, require it to be different from operand's reg. - - GenTree* size = tree->gtGetOp1(); - if (size->IsCnsIntOrI()) - { - assert(size->isContained()); - srcCount = 0; - size_t sizeVal = size->AsIntCon()->gtIconVal; - - if (sizeVal == 0) - { - buildInternalIntRegisterDefForNode(tree); - } - else - { - // Compute the amount of memory to properly STACK_ALIGN. - // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size. - // This should also help in debugging as we can examine the original size specified with localloc. - sizeVal = AlignUp(sizeVal, STACK_ALIGN); - - // For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc) - // we will generate 'push 0'. - assert((sizeVal % REGSIZE_BYTES) == 0); - size_t cntRegSizedWords = sizeVal / REGSIZE_BYTES; - if (cntRegSizedWords > 6) - { - if (!compiler->info.compInitMem) - { - // No need to initialize allocated stack space. - if (sizeVal < compiler->eeGetPageSize()) - { -#ifdef TARGET_X86 - // x86 needs a register here to avoid generating "sub" on ESP. - buildInternalIntRegisterDefForNode(tree); -#endif - } - else - { - // We need two registers: regCnt and RegTmp - buildInternalIntRegisterDefForNode(tree); - buildInternalIntRegisterDefForNode(tree); - } - } - } - } - } - else - { - if (!compiler->info.compInitMem) - { - buildInternalIntRegisterDefForNode(tree); - buildInternalIntRegisterDefForNode(tree); - } - BuildUse(size); - } - buildInternalRegisterUses(); - BuildDef(tree); - return srcCount; -} - -//------------------------------------------------------------------------ -// BuildModDiv: Set the NodeInfo for GT_MOD/GT_DIV/GT_UMOD/GT_UDIV. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// The number of sources consumed by this node. 
-// -int LinearScan::BuildModDiv(GenTree* tree) -{ - GenTree* op1 = tree->gtGetOp1(); - GenTree* op2 = tree->gtGetOp2(); - regMaskTP dstCandidates = RBM_NONE; - RefPosition* internalDef = nullptr; - int srcCount = 0; - - if (varTypeIsFloating(tree->TypeGet())) - { - return BuildSimple(tree); - } - - // Amd64 Div/Idiv instruction: - // Dividend in RAX:RDX and computes - // Quotient in RAX, Remainder in RDX - - if (tree->OperGet() == GT_MOD || tree->OperGet() == GT_UMOD) - { - // We are interested in just the remainder. - // RAX is used as a trashable register during computation of remainder. - dstCandidates = RBM_RDX; - } - else - { - // We are interested in just the quotient. - // RDX gets used as trashable register during computation of quotient - dstCandidates = RBM_RAX; - } - -#ifdef TARGET_X86 - if (op1->OperGet() == GT_LONG) - { - assert(op1->isContained()); - - // To avoid reg move would like to have op1's low part in RAX and high part in RDX. - GenTree* loVal = op1->gtGetOp1(); - GenTree* hiVal = op1->gtGetOp2(); - assert(!loVal->isContained() && !hiVal->isContained()); - - assert(op2->IsCnsIntOrI()); - assert(tree->OperGet() == GT_UMOD); - - // This situation also requires an internal register. - buildInternalIntRegisterDefForNode(tree); - - BuildUse(loVal, RBM_EAX); - BuildUse(hiVal, RBM_EDX); - srcCount = 2; - } - else -#endif - { - // If possible would like to have op1 in RAX to avoid a register move. - RefPosition* op1Use = BuildUse(op1, RBM_EAX); - tgtPrefUse = op1Use; - srcCount = 1; - } - - srcCount += BuildDelayFreeUses(op2, op1, allRegs(TYP_INT) & ~(RBM_RAX | RBM_RDX)); - - buildInternalRegisterUses(); - - regMaskTP killMask = getKillSetForModDiv(tree->AsOp()); - BuildDefsWithKills(tree, 1, dstCandidates, killMask); - return srcCount; -} - -//------------------------------------------------------------------------ -// BuildIntrinsic: Set the NodeInfo for a GT_INTRINSIC. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildIntrinsic(GenTree* tree) -{ - // Both operand and its result must be of floating point type. - GenTree* op1 = tree->gtGetOp1(); - assert(varTypeIsFloating(op1)); - assert(op1->TypeGet() == tree->TypeGet()); - RefPosition* internalFloatDef = nullptr; - - switch (tree->AsIntrinsic()->gtIntrinsicName) - { - case NI_System_Math_Abs: - // Abs(float x) = x & 0x7fffffff - // Abs(double x) = x & 0x7ffffff ffffffff - - // In case of Abs we need an internal register to hold mask. - - // TODO-XArch-CQ: avoid using an internal register for the mask. - // Andps or andpd both will operate on 128-bit operands. - // The data section constant to hold the mask is a 64-bit size. - // Therefore, we need both the operand and mask to be in - // xmm register. When we add support in emitter to emit 128-bit - // data constants and instructions that operate on 128-bit - // memory operands we can avoid the need for an internal register. 
- internalFloatDef = buildInternalFloatRegisterDefForNode(tree, internalFloatRegCandidates()); - break; - -#ifdef TARGET_X86 - case NI_System_Math_Cos: - case NI_System_Math_Sin: - NYI_X86("Math intrinsics Cos and Sin"); - break; -#endif // TARGET_X86 - - case NI_System_Math_Sqrt: - case NI_System_Math_Round: - case NI_System_Math_Ceiling: - case NI_System_Math_Floor: - break; - - default: - // Right now only Sqrt/Abs are treated as math intrinsics - noway_assert(!"Unsupported math intrinsic"); - unreached(); - break; - } - assert(tree->gtGetOp2IfPresent() == nullptr); - int srcCount; - if (op1->isContained()) - { - srcCount = BuildOperandUses(op1); - } - else - { - tgtPrefUse = BuildUse(op1); - srcCount = 1; - } - if (internalFloatDef != nullptr) - { - buildInternalRegisterUses(); - } - BuildDef(tree); - return srcCount; -} - -#ifdef FEATURE_SIMD -//------------------------------------------------------------------------ -// BuildSIMD: Set the NodeInfo for a GT_SIMD tree. -// -// Arguments: -// tree - The GT_SIMD node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) -{ - // All intrinsics have a dstCount of 1 - assert(simdTree->IsValue()); - - bool buildUses = true; - regMaskTP dstCandidates = RBM_NONE; - - if (simdTree->isContained()) - { - // Only SIMDIntrinsicInit can be contained - assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicInit); - } - SetContainsAVXFlags(simdTree->gtSIMDSize); - GenTree* op1 = simdTree->gtGetOp1(); - GenTree* op2 = simdTree->gtGetOp2(); - int srcCount = 0; - - switch (simdTree->gtSIMDIntrinsicID) - { - case SIMDIntrinsicInit: - { - // This sets all fields of a SIMD struct to the given value. - // Mark op1 as contained if it is either zero or int constant of all 1's, - // or a float constant with 16 or 32 byte simdType (AVX case) - // - // Note that for small int base types, the initVal has been constructed so that - // we can use the full int value. - CLANG_FORMAT_COMMENT_ANCHOR; - -#if !defined(TARGET_64BIT) - if (op1->OperGet() == GT_LONG) - { - assert(op1->isContained()); - GenTree* op1lo = op1->gtGetOp1(); - GenTree* op1hi = op1->gtGetOp2(); - - if (op1lo->isContained()) - { - srcCount = 0; - assert(op1hi->isContained()); - assert((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) || - (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))); - } - else - { - srcCount = 2; - buildInternalFloatRegisterDefForNode(simdTree); - setInternalRegsDelayFree = true; - } - - if (srcCount == 2) - { - BuildUse(op1lo, RBM_EAX); - BuildUse(op1hi, RBM_EDX); - } - buildUses = false; - } -#endif // !defined(TARGET_64BIT) - } - break; - - case SIMDIntrinsicInitN: - { - var_types baseType = simdTree->gtSIMDBaseType; - srcCount = (short)(simdTree->gtSIMDSize / genTypeSize(baseType)); - // Need an internal register to stitch together all the values into a single vector in a SIMD reg. - buildInternalFloatRegisterDefForNode(simdTree); - int initCount = 0; - for (GenTree* list = op1; list != nullptr; list = list->gtGetOp2()) - { - assert(list->OperGet() == GT_LIST); - GenTree* listItem = list->gtGetOp1(); - assert(listItem->TypeGet() == baseType); - assert(!listItem->isContained()); - BuildUse(listItem); - initCount++; - } - assert(initCount == srcCount); - buildUses = false; - } - break; - - case SIMDIntrinsicInitArray: - // We have an array and an index, which may be contained. 
- break; - - case SIMDIntrinsicSub: - case SIMDIntrinsicBitwiseAnd: - case SIMDIntrinsicBitwiseOr: - break; - - case SIMDIntrinsicEqual: - break; - - case SIMDIntrinsicGetItem: - { - // This implements get_Item method. The sources are: - // - the source SIMD struct - // - index (which element to get) - // The result is baseType of SIMD struct. - // op1 may be a contained memory op, but if so we will consume its address. - // op2 may be a contained constant. - op1 = simdTree->gtGetOp1(); - op2 = simdTree->gtGetOp2(); - - if (!op1->isContained()) - { - // If the index is not a constant, we will use the SIMD temp location to store the vector. - // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we - // can use that in the process of extracting the element. - // - // If the index is a constant and base type is a small int we can use pextrw, but on AVX - // we will need a temp if are indexing into the upper half of the AVX register. - // In all other cases with constant index, we need a temp xmm register to extract the - // element if index is other than zero. - - if (!op2->IsCnsIntOrI()) - { - (void)compiler->getSIMDInitTempVarNum(); - } - else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) - { - bool needFloatTemp; - if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && - (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)) - { - int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); - needFloatTemp = (byteShiftCnt >= 16); - } - else - { - needFloatTemp = !op2->IsIntegralConst(0); - } - - if (needFloatTemp) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - } -#ifdef TARGET_X86 - // This logic is duplicated from genSIMDIntrinsicGetItem(). - // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to - // generate a movzx/movsx. On x86, these require byteable registers. So figure out which - // cases will require this, so the non-byteable registers can be excluded. - - var_types baseType = simdTree->gtSIMDBaseType; - if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType)) - { - bool ZeroOrSignExtnReqd = true; - unsigned baseSize = genTypeSize(baseType); - if (baseSize == 1) - { - if ((op2->AsIntCon()->gtIconVal % 2) == 1) - { - ZeroOrSignExtnReqd = (baseType == TYP_BYTE); - } - } - else - { - assert(baseSize == 2); - ZeroOrSignExtnReqd = (baseType == TYP_SHORT); - } - if (ZeroOrSignExtnReqd) - { - dstCandidates = allByteRegs(); - } - } -#endif // TARGET_X86 - } - } - break; - - case SIMDIntrinsicSetX: - case SIMDIntrinsicSetY: - case SIMDIntrinsicSetZ: - case SIMDIntrinsicSetW: - // We need an internal integer register for SSE2 codegen - if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) - { - buildInternalIntRegisterDefForNode(simdTree); - } - - break; - - case SIMDIntrinsicCast: - break; - - case SIMDIntrinsicConvertToSingle: - if (simdTree->gtSIMDBaseType == TYP_UINT) - { - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - buildInternalFloatRegisterDefForNode(simdTree); - // We also need an integer register. - buildInternalIntRegisterDefForNode(simdTree); - } - break; - - case SIMDIntrinsicConvertToInt32: - break; - - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - if (varTypeIsIntegral(simdTree->gtSIMDBaseType)) - { - // We need an internal register different from targetReg. 
- setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - } - break; - - case SIMDIntrinsicConvertToInt64: - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - if (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - // We also need an integer register. - buildInternalIntRegisterDefForNode(simdTree); - break; - - case SIMDIntrinsicConvertToDouble: - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); -#ifdef TARGET_X86 - if (simdTree->gtSIMDBaseType == TYP_LONG) - { - buildInternalFloatRegisterDefForNode(simdTree); - buildInternalFloatRegisterDefForNode(simdTree); - } - else -#endif - if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) || (simdTree->gtSIMDBaseType == TYP_ULONG)) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - // We also need an integer register. - buildInternalIntRegisterDefForNode(simdTree); - break; - - case SIMDIntrinsicNarrow: - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->gtSIMDBaseType != TYP_DOUBLE)) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - break; - - case SIMDIntrinsicShuffleSSE2: - // Second operand is an integer constant and marked as contained. - assert(simdTree->gtGetOp2()->isContainedIntOrIImmed()); - break; - - case SIMDIntrinsicGetX: - case SIMDIntrinsicGetY: - case SIMDIntrinsicGetZ: - case SIMDIntrinsicGetW: - assert(!"Get intrinsics should not be seen during Lowering."); - unreached(); - - default: - noway_assert(!"Unimplemented SIMD node type."); - unreached(); - } - if (buildUses) - { - assert(!op1->OperIs(GT_LIST)); - assert(srcCount == 0); - // This is overly conservative, but is here for zero diffs. - srcCount = BuildRMWUses(simdTree); - } - buildInternalRegisterUses(); - BuildDef(simdTree, dstCandidates); - return srcCount; -} -#endif // FEATURE_SIMD - -#ifdef FEATURE_HW_INTRINSICS -//------------------------------------------------------------------------ -// BuildHWIntrinsic: Set the NodeInfo for a GT_HWINTRINSIC tree. -// -// Arguments: -// tree - The GT_HWINTRINSIC node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) -{ - NamedIntrinsic intrinsicId = intrinsicTree->gtHWIntrinsicId; - var_types baseType = intrinsicTree->gtSIMDBaseType; - CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsicId); - HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId); - int numArgs = HWIntrinsicInfo::lookupNumArgs(intrinsicTree); - - // Set the AVX Flags if this instruction may use VEX encoding for SIMD operations. - // Note that this may be true even if the ISA is not AVX (e.g. for platform-agnostic intrinsics - // or non-AVX intrinsics that will use VEX encoding if it is available on the target). - if (intrinsicTree->isSIMD()) - { - SetContainsAVXFlags(intrinsicTree->gtSIMDSize); - } - - GenTree* op1 = intrinsicTree->gtGetOp1(); - GenTree* op2 = intrinsicTree->gtGetOp2(); - GenTree* op3 = nullptr; - GenTree* lastOp = nullptr; - - int srcCount = 0; - int dstCount = intrinsicTree->IsValue() ? 
1 : 0; - - regMaskTP dstCandidates = RBM_NONE; - - if (op1 == nullptr) - { - assert(op2 == nullptr); - assert(numArgs == 0); - } - else - { - if (op1->OperIsList()) - { - assert(op2 == nullptr); - assert(numArgs >= 3); - - GenTreeArgList* argList = op1->AsArgList(); - - op1 = argList->Current(); - argList = argList->Rest(); - - op2 = argList->Current(); - argList = argList->Rest(); - - op3 = argList->Current(); - - while (argList->Rest() != nullptr) - { - argList = argList->Rest(); - } - - lastOp = argList->Current(); - argList = argList->Rest(); - - assert(argList == nullptr); - } - else if (op2 != nullptr) - { - assert(numArgs == 2); - lastOp = op2; - } - else - { - assert(numArgs == 1); - lastOp = op1; - } - - assert(lastOp != nullptr); - - bool buildUses = true; - - if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId)) - { - if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed()) - { - assert(!lastOp->IsCnsIntOrI()); - - // We need two extra reg when lastOp isn't a constant so - // the offset into the jump table for the fallback path - // can be computed. - buildInternalIntRegisterDefForNode(intrinsicTree); - buildInternalIntRegisterDefForNode(intrinsicTree); - } - } - - // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it - // is not allocated the same register as the target. - bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); - - // Create internal temps, and handle any other special requirements. - // Note that the default case for building uses will handle the RMW flag, but if the uses - // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree) - // must be handled within the case. - switch (intrinsicId) - { - case NI_Vector128_CreateScalarUnsafe: - case NI_Vector128_ToScalar: - case NI_Vector256_CreateScalarUnsafe: - case NI_Vector256_ToScalar: - { - assert(numArgs == 1); - - if (varTypeIsFloating(baseType)) - { - if (op1->isContained()) - { - srcCount += BuildOperandUses(op1); - } - else - { - // We will either be in memory and need to be moved - // into a register of the appropriate size or we - // are already in an XMM/YMM register and can stay - // where we are. - - tgtPrefUse = BuildUse(op1); - srcCount += 1; - } - - buildUses = false; - } - break; - } - - case NI_Vector128_ToVector256: - case NI_Vector128_ToVector256Unsafe: - case NI_Vector256_GetLower: - { - assert(numArgs == 1); - - if (op1->isContained()) - { - srcCount += BuildOperandUses(op1); - } - else - { - // We will either be in memory and need to be moved - // into a register of the appropriate size or we - // are already in an XMM/YMM register and can stay - // where we are. - - tgtPrefUse = BuildUse(op1); - srcCount += 1; - } - - buildUses = false; - break; - } - - case NI_SSE2_MaskMove: - { - assert(numArgs == 3); - assert(!isRMW); - - // MaskMove hardcodes the destination (op3) in DI/EDI/RDI - srcCount += BuildOperandUses(op1); - srcCount += BuildOperandUses(op2); - srcCount += BuildOperandUses(op3, RBM_EDI); - - buildUses = false; - break; - } - - case NI_SSE41_BlendVariable: - { - assert(numArgs == 3); - - if (!compiler->canUseVexEncoding()) - { - assert(isRMW); - - // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - tgtPrefUse = BuildUse(op1); - - srcCount += 1; - srcCount += op2->isContained() ? 
BuildOperandUses(op2) : BuildDelayFreeUses(op2, op1); - srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); - - buildUses = false; - } - break; - } - - case NI_SSE41_Extract: - { - if (baseType == TYP_FLOAT) - { - buildInternalIntRegisterDefForNode(intrinsicTree); - } -#ifdef TARGET_X86 - else if (varTypeIsByte(baseType)) - { - dstCandidates = allByteRegs(); - } -#endif - break; - } - -#ifdef TARGET_X86 - case NI_SSE42_Crc32: - case NI_SSE42_X64_Crc32: - { - // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument - // to the code generator. We may want to encode the overload info in another way. - - assert(numArgs == 2); - assert(isRMW); - - // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers. - tgtPrefUse = BuildUse(op1); - - srcCount += 1; - srcCount += BuildDelayFreeUses(op2, op1, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE); - - buildUses = false; - break; - } -#endif // TARGET_X86 - - case NI_BMI2_MultiplyNoFlags: - case NI_BMI2_X64_MultiplyNoFlags: - { - assert(numArgs == 2 || numArgs == 3); - srcCount += BuildOperandUses(op1, RBM_EDX); - srcCount += BuildOperandUses(op2); - if (numArgs == 3) - { - // op3 reg should be different from target reg to - // store the lower half result after executing the instruction - srcCount += BuildDelayFreeUses(op3, op1); - // Need a internal register different from the dst to take the lower half result - buildInternalIntRegisterDefForNode(intrinsicTree); - setInternalRegsDelayFree = true; - } - buildUses = false; - break; - } - - case NI_FMA_MultiplyAdd: - case NI_FMA_MultiplyAddNegated: - case NI_FMA_MultiplyAddNegatedScalar: - case NI_FMA_MultiplyAddScalar: - case NI_FMA_MultiplyAddSubtract: - case NI_FMA_MultiplySubtract: - case NI_FMA_MultiplySubtractAdd: - case NI_FMA_MultiplySubtractNegated: - case NI_FMA_MultiplySubtractNegatedScalar: - case NI_FMA_MultiplySubtractScalar: - { - assert(numArgs == 3); - assert(isRMW); - - const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId); - - // Intrinsics with CopyUpperBits semantics cannot have op1 be contained - assert(!copiesUpperBits || !op1->isContained()); - - if (op2->isContained()) - { - // 132 form: op1 = (op1 * op3) + [op2] - - tgtPrefUse = BuildUse(op1); - - srcCount += 1; - srcCount += BuildOperandUses(op2); - srcCount += BuildDelayFreeUses(op3, op1); - } - else if (op1->isContained()) - { - // 231 form: op3 = (op2 * op3) + [op1] - - tgtPrefUse = BuildUse(op3); - - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2, op1); - srcCount += 1; - } - else - { - // 213 form: op1 = (op2 * op1) + [op3] - - tgtPrefUse = BuildUse(op1); - srcCount += 1; - - if (copiesUpperBits) - { - srcCount += BuildDelayFreeUses(op2, op1); - } - else - { - tgtPrefUse2 = BuildUse(op2); - srcCount += 1; - } - - srcCount += op3->isContained() ? 
BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1); - } - - buildUses = false; - break; - } - - case NI_AVX2_GatherVector128: - case NI_AVX2_GatherVector256: - { - assert(numArgs == 3); - assert(!isRMW); - - // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2, op1); - - // op3 should always be contained - assert(op3->isContained()); - - // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); - setInternalRegsDelayFree = true; - - buildUses = false; - break; - } - - case NI_AVX2_GatherMaskVector128: - case NI_AVX2_GatherMaskVector256: - { - assert(numArgs == 5); - assert(!isRMW); - assert(intrinsicTree->gtGetOp1()->OperIsList()); - - GenTreeArgList* argList = intrinsicTree->gtGetOp1()->AsArgList()->Rest()->Rest()->Rest(); - GenTree* op4 = argList->Current(); - - // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); - srcCount += BuildDelayFreeUses(op3); - srcCount += BuildDelayFreeUses(op4); - - // op5 should always be contained - assert(argList->Rest()->Current()->isContained()); - - // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); - setInternalRegsDelayFree = true; - - buildUses = false; - break; - } - - default: - { - assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); - break; - } - } - - if (buildUses) - { - assert((numArgs > 0) && (numArgs < 4)); - - if (intrinsicTree->OperIsMemoryLoadOrStore()) - { - srcCount += BuildAddrUses(op1); - } - else if (isRMW && !op1->isContained()) - { - tgtPrefUse = BuildUse(op1); - srcCount += 1; - } - else - { - srcCount += BuildOperandUses(op1); - } - - if (op2 != nullptr) - { - if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) - { - srcCount += BuildAddrUses(op2->gtGetOp1()); - } - else if (isRMW) - { - if (!op2->isContained() && HWIntrinsicInfo::IsCommutative(intrinsicId)) - { - // When op2 is not contained and we are commutative, we can set op2 - // to also be a tgtPrefUse. Codegen will then swap the operands. - - tgtPrefUse2 = BuildUse(op2); - srcCount += 1; - } - else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet())) - { - // When op2 is not contained or if we are producing a scalar value - // we need to mark it as delay free because the operand and target - // exist in the same register set. - - srcCount += BuildDelayFreeUses(op2); - } - else - { - // When op2 is contained and we are not producing a scalar value we - // have no concerns of overwriting op2 because they exist in different - // register sets. - - srcCount += BuildOperandUses(op2); - } - } - else - { - srcCount += BuildOperandUses(op2); - } - - if (op3 != nullptr) - { - srcCount += isRMW ? BuildDelayFreeUses(op3) : BuildOperandUses(op3); - } - } - } - - buildInternalRegisterUses(); - } - - if (dstCount == 1) - { - BuildDef(intrinsicTree, dstCandidates); - } - else - { - assert(dstCount == 0); - } - - return srcCount; -} -#endif - -//------------------------------------------------------------------------ -// BuildCast: Set the NodeInfo for a GT_CAST. -// -// Arguments: -// cast - The GT_CAST node -// -// Return Value: -// The number of sources consumed by this node. 
-// -int LinearScan::BuildCast(GenTreeCast* cast) -{ - GenTree* src = cast->gtGetOp1(); - - const var_types srcType = genActualType(src->TypeGet()); - const var_types castType = cast->gtCastType; - - regMaskTP candidates = RBM_NONE; -#ifdef TARGET_X86 - if (varTypeIsByte(castType)) - { - candidates = allByteRegs(); - } - - assert(!varTypeIsLong(srcType) || (src->OperIs(GT_LONG) && src->isContained())); -#else - // Overflow checking cast from TYP_(U)LONG to TYP_UINT requires a temporary - // register to extract the upper 32 bits of the 64 bit source register. - if (cast->gtOverflow() && varTypeIsLong(srcType) && (castType == TYP_UINT)) - { - // Here we don't need internal register to be different from targetReg, - // rather require it to be different from operand's reg. - buildInternalIntRegisterDefForNode(cast); - } -#endif - - int srcCount = BuildOperandUses(src, candidates); - buildInternalRegisterUses(); - BuildDef(cast, candidates); - return srcCount; -} - -//----------------------------------------------------------------------------------------- -// BuildIndir: Specify register requirements for address expression of an indirection operation. -// -// Arguments: -// indirTree - GT_IND or GT_STOREIND gentree node -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildIndir(GenTreeIndir* indirTree) -{ - assert(false); - return 0; -} - -//------------------------------------------------------------------------ -// BuildMul: Set the NodeInfo for a multiply. -// -// Arguments: -// tree - The node of interest -// -// Return Value: -// The number of sources consumed by this node. -// -int LinearScan::BuildMul(GenTree* tree) -{ - assert(tree->OperIsMul()); - GenTree* op1 = tree->gtGetOp1(); - GenTree* op2 = tree->gtGetOp2(); - - // Only non-floating point mul has special requirements - if (varTypeIsFloating(tree->TypeGet())) - { - return BuildSimple(tree); - } - - int srcCount = BuildBinaryUses(tree->AsOp()); - int dstCount = 1; - regMaskTP dstCandidates = RBM_NONE; - - bool isUnsignedMultiply = ((tree->gtFlags & GTF_UNSIGNED) != 0); - bool requiresOverflowCheck = tree->gtOverflowEx(); - - // There are three forms of x86 multiply: - // one-op form: RDX:RAX = RAX * r/m - // two-op form: reg *= r/m - // three-op form: reg = r/m * imm - - // This special widening 32x32->64 MUL is not used on x64 - CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_X86) - if (tree->OperGet() != GT_MUL_LONG) -#endif - { - assert((tree->gtFlags & GTF_MUL_64RSLT) == 0); - } - - // We do use the widening multiply to implement - // the overflow checking for unsigned multiply - // - if (isUnsignedMultiply && requiresOverflowCheck) - { - // The only encoding provided is RDX:RAX = RAX * rm - // - // Here we set RAX as the only destination candidate - // In LSRA we set the kill set for this operation to RBM_RAX|RBM_RDX - // - dstCandidates = RBM_RAX; - } - else if (tree->OperGet() == GT_MULHI) - { - // Have to use the encoding:RDX:RAX = RAX * rm. Since we only care about the - // upper 32 bits of the result set the destination candidate to REG_RDX. 
- dstCandidates = RBM_RDX; - } -#if defined(TARGET_X86) - else if (tree->OperGet() == GT_MUL_LONG) - { - // have to use the encoding:RDX:RAX = RAX * rm - dstCandidates = RBM_RAX | RBM_RDX; - dstCount = 2; - } -#endif - GenTree* containedMemOp = nullptr; - if (op1->isContained() && !op1->IsCnsIntOrI()) - { - assert(!op2->isContained() || op2->IsCnsIntOrI()); - containedMemOp = op1; - } - else if (op2->isContained() && !op2->IsCnsIntOrI()) - { - containedMemOp = op2; - } - regMaskTP killMask = getKillSetForMul(tree->AsOp()); - BuildDefsWithKills(tree, dstCount, dstCandidates, killMask); - return srcCount; -} - -//------------------------------------------------------------------------------ -// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set -// Contains256bitAVX flag when SIMD vector size is 32 bytes -// -// Arguments: -// isFloatingPointType - true if it is floating point type -// sizeOfSIMDVector - SIMD Vector size -// -void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) -{ - assert(false); -} - -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index c0d565416cce..d6ce4be200e7 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -13,7 +13,7 @@ #endif // TODO: WASM doesn't have these but can't compile without them -#if defined(TARGET_XARCH) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_XARCH) || defined(TARGET_WASM) #if defined(TARGET_X86) /* @@ -70,7 +70,7 @@ REGALIAS(EDI, RDI) #ifdef TARGET_AMD64 #define XMMBASE 16 #define XMMMASK(x) (__int64(1) << ((x)+XMMBASE)) -#elif defined(TARGET_WASM32) || defined(TARGET_WSM64) +#elif defined(TARGET_WASM) #define XMMBASE 16 #define XMMMASK(x) (__int64(1) << ((x)+XMMBASE)) #else // !TARGET_AMD64 @@ -107,7 +107,7 @@ REGDEF(STK, 16+XMMBASE, 0x0000, "STK" ) #elif defined(TARGET_ARM64) #include "registerarm64.h" -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #else #error Unsupported or unset target architecture #endif // target type diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 235b6a99d626..4cd84e8d0347 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -33,7 +33,7 @@ enum SIMDLevel // Vector length is 256-bit and SIMD instructions are VEX-256 encoded. // Floating-point instructions are VEX-128 encoded. SIMD_AVX2_Supported = 3 -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) // SSE2 - The min bar of SIMD ISA on x86/x64. // Vector length is 128-bit. // Floating-point instructions are legacy SSE encoded. diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 5e04e6a1b826..72e162cf8b86 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -45,7 +45,7 @@ #define REGMASK_BITS 64 #define CSE_CONST_SHARED_LOW_BITS 12 -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) #define REGMASK_BITS 32 #define CSE_CONST_SHARED_LOW_BITS 16 #else @@ -148,7 +148,7 @@ enum _regMask_enum : unsigned #include "register.h" }; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) enum _regNumber_enum : unsigned { #define REGDEF(name, rnum, mask, sname) REG_##name = rnum, @@ -1583,7 +1583,7 @@ typedef unsigned char regNumberSmall; // have encoding that restricts what registers that can be used for the indexed element when the element size is H (i.e. 2 bytes). 
#define RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15) -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO: a copy of X64 +#elif defined(TARGET_WASM) // TODO: a copy of X64 #define RBM_LNGRET_LO RBM_EAX #define REG_LNGRET_HI REG_EDX #define RBM_LNGRET_HI RBM_EDX @@ -2426,7 +2426,7 @@ inline regMaskTP genRegMaskFloat(regNumber reg, var_types type = TYP_DOUBLE); */ inline bool genIsValidReg(regNumber reg) { -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) // infinite "registers" +#if defined(TARGET_WASM) // infinite "registers" return true; #else /* It's safest to perform an unsigned comparison in case reg is negative */ @@ -2541,7 +2541,7 @@ inline regMaskTP fullIntArgRegMask() // inline bool isValidIntArgReg(regNumber reg) { -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WASM) return true; #else return (genRegMask(reg) & fullIntArgRegMask()) != 0; @@ -2622,7 +2622,7 @@ inline regMaskTP genRegMask(regNumber reg) regMaskTP result = 1 << reg; assert(result == regMasks[reg]); return result; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) regMaskTP result = 1 << reg; assert(result == regMasks[reg]); return result; @@ -2638,7 +2638,7 @@ inline regMaskTP genRegMask(regNumber reg) inline regMaskTP genRegMaskFloat(regNumber reg, var_types type /* = TYP_DOUBLE */) { -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_X86) || defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_X86) || defined(TARGET_WASM) assert(genIsValidFloatReg(reg)); assert((unsigned)reg < ArrLen(regMasks)); return regMasks[reg]; diff --git a/src/coreclr/jit/targetwasm.cpp b/src/coreclr/jit/targetwasm.cpp index 7f752a5364b0..501a480b2edd 100644 --- a/src/coreclr/jit/targetwasm.cpp +++ b/src/coreclr/jit/targetwasm.cpp @@ -8,7 +8,7 @@ #pragma hdrstop #endif -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WASM) #include "target.h" diff --git a/src/coreclr/jit/unwindwasm.cpp b/src/coreclr/jit/unwindwasm.cpp index 6cf194c762fd..4306917bc131 100644 --- a/src/coreclr/jit/unwindwasm.cpp +++ b/src/coreclr/jit/unwindwasm.cpp @@ -15,7 +15,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma hdrstop #endif -#if defined(TARGET_WASM32) || defined(TARGET_WASM64) +#if defined(TARGET_WASM) typedef union _UNWIND_CODE { struct { UCHAR CodeOffset; @@ -423,4 +423,4 @@ void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode assert(false); } -#endif // defined(TARGET_WASM32) || defined(TARGET_WASM64) +#endif // defined(TARGET_WASM) diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp index 36e41cf11af5..768611b2dda5 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -335,7 +335,7 @@ void dspRegMask(regMaskTP regMask, size_t minSiz) } #elif defined(TARGET_X86) // No register ranges -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) // TODO Wasm +#elif defined(TARGET_WASM) // TODO Wasm // For AMD64, create ranges for int registers R8 through R15, but not the "old" registers. 
if (regNum >= REG_R8) { diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index bf106795dd32..d197285c86ec 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -57,7 +57,7 @@ struct FloatTraits unsigned bits = 0xFFC00000u; #elif defined(TARGET_ARMARCH) unsigned bits = 0x7FC00000u; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) unsigned bits = 0x7FC00000u; #else #error Unsupported or unset target architecture @@ -85,7 +85,7 @@ struct DoubleTraits unsigned long long bits = 0xFFF8000000000000ull; #elif defined(TARGET_ARMARCH) unsigned long long bits = 0x7FF8000000000000ull; -#elif defined(TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) unsigned long long bits = 0xFFF8000000000000ull; #else #error Unsupported or unset target architecture diff --git a/src/coreclr/jit/valuenumfuncs.h b/src/coreclr/jit/valuenumfuncs.h index 6c5fb3d81b65..5167eb7dff05 100644 --- a/src/coreclr/jit/valuenumfuncs.h +++ b/src/coreclr/jit/valuenumfuncs.h @@ -180,7 +180,7 @@ ValueNumFuncDef(HWI_##isa##_##name, argCount, false, false, false) // All of t #elif defined (TARGET_ARM) // No Hardware Intrinsics on ARM32 -#elif defined (TARGET_WASM32) || defined(TARGET_WASM64) +#elif defined(TARGET_WASM) // No Hardware Intrinsics on WebAssembly #else #error Unsupported platform From 13a069513509dfa4552f16d7149c656dda73fac9 Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 7 Feb 2021 21:39:49 -0500 Subject: [PATCH 09/44] typo on #endif --- src/coreclr/jit/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 554acf735a32..3413a3adf15c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4739,7 +4739,7 @@ class Compiler #ifndef TARGET_WASM void fgSetOptions(); -#endif !TARGET_WASM +#endif// !TARGET_WASM #ifdef DEBUG static fgWalkPreFn fgAssertNoQmark; From 70750e13742732753cd0f2bfca19f2a8c620d481 Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 8 Feb 2021 14:41:20 -0500 Subject: [PATCH 10/44] remove inst* with ifdef remove stacklevelsetter with ifdef fix ifdef syntax --- src/coreclr/jit/compiler.cpp | 6 +- src/coreclr/jit/compiler.h | 23 +- src/coreclr/jit/importer.cpp | 4 + src/coreclr/jit/instr.h | 15 +- src/coreclr/jit/instrs.h | 4 +- src/coreclr/jit/instrswasm.h | 774 --------------------------- src/coreclr/jit/jitgcinfo.h | 2 + src/coreclr/jit/lsra.h | 19 +- src/coreclr/jit/stacklevelsetter.cpp | 11 +- src/coreclr/jit/stacklevelsetter.h | 4 + src/coreclr/jit/typelist.h | 4 + src/coreclr/tools/aot/ilc.sln | 18 + 12 files changed, 47 insertions(+), 837 deletions(-) delete mode 100644 src/coreclr/jit/instrswasm.h diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 885c1f602783..3b70be9b9097 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4854,7 +4854,7 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags #ifndef TARGET_WASM // Decide the kind of code we want to generate fgSetOptions(); -#endif !TARGET_WASM +#endif // !TARGET_WASM fgExpandQmarkNodes(); @@ -8656,7 +8656,7 @@ void cVarsFinal(Compiler* comp) printf("===================================================================== *Vars %u\n", sequenceNumber++); comp->lvaTableDump(Compiler::FINAL_FRAME_LAYOUT); } -#endif !TARGET_WASM +#endif // !TARGET_WASM void cBlockCheapPreds(Compiler* comp, BasicBlock* block) { @@ -8785,7 +8785,7 @@ void dVarsFinal() { 
cVarsFinal(JitTls::GetCompiler()); } -#endif !TARGET_WASM +#endif // !TARGET_WASM void dBlockPreds(BasicBlock* block) { diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 3413a3adf15c..a3ab9051b311 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4739,7 +4739,7 @@ class Compiler #ifndef TARGET_WASM void fgSetOptions(); -#endif// !TARGET_WASM +#endif // !TARGET_WASM #ifdef DEBUG static fgWalkPreFn fgAssertNoQmark; @@ -11266,27 +11266,6 @@ const instruction INS_SQRT = INS_fsqrt; #endif // TARGET_ARM64 -#if defined(TARGET_WASM) - -const instruction INS_SHIFT_LEFT_LOGICAL = INS_shl; -const instruction INS_SHIFT_RIGHT_LOGICAL = INS_shr; -const instruction INS_SHIFT_RIGHT_ARITHM = INS_sar; - -const instruction INS_AND = INS_and; -const instruction INS_OR = INS_or; -const instruction INS_XOR = INS_xor; -const instruction INS_NEG = INS_neg; -const instruction INS_TEST = INS_test; -const instruction INS_MUL = INS_imul; -const instruction INS_SIGNED_DIVIDE = INS_idiv; -const instruction INS_UNSIGNED_DIVIDE = INS_div; -const instruction INS_BREAKPOINT = INS_int3; -const instruction INS_ADDC = INS_adc; -const instruction INS_SUBC = INS_sbb; -const instruction INS_NOT = INS_not; - -#endif // defined(TARGET_WASM) - /*****************************************************************************/ #ifndef TARGET_WASM diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index c138d92d330a..f7bfc198c552 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -19690,7 +19690,11 @@ void Compiler::impInlineInitVars(InlineInfo* pInlineInfo) return; } } +#ifndef TARGET_WASM else if (genTypeSize(sigType) < EA_PTRSIZE) +#else + else if (genTypeSize(sigType) < TARGET_POINTER_SIZE) // TODO: is this sensible in the abscence of EA_PTRSIZE? +#endif { // Narrowing cast. if (inlArgNode->OperIs(GT_LCL_VAR)) diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 862616b28860..b6ebbd78a86d 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. /*****************************************************************************/ +#ifndef TARGET_WASM #ifndef _INSTR_H_ #define _INSTR_H_ /*****************************************************************************/ @@ -114,19 +115,6 @@ enum insFlags: unsigned INS_FLAGS_SET = 0x01, INS_FLAGS_DONT_CARE = 0x02, }; -#elif defined(TARGET_WASM) // TODO : can this be removed/empty? -enum insFlags: uint8_t -{ - INS_FLAGS_None = 0x00, - INS_FLAGS_ReadsFlags = 0x01, - INS_FLAGS_WritesFlags = 0x02, - INS_FLAGS_x87Instr = 0x04, - INS_Flags_IsDstDstSrcAVXInstruction = 0x08, - INS_Flags_IsDstSrcSrcAVXInstruction = 0x10, - - // TODO-Cleanup: - INS_FLAGS_DONT_CARE = 0x00, -}; #else #error Unsupported target architecture #endif @@ -317,3 +305,4 @@ enum emitAttr : unsigned /*****************************************************************************/ #endif //_INSTR_H_ /*****************************************************************************/ +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/instrs.h b/src/coreclr/jit/instrs.h index 4e9b1b1f648f..770c5a61e4f7 100644 --- a/src/coreclr/jit/instrs.h +++ b/src/coreclr/jit/instrs.h @@ -1,14 +1,14 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+#ifndef TARGET_WASM #if defined(TARGET_XARCH) #include "instrsxarch.h" #elif defined(TARGET_ARM) #include "instrsarm.h" #elif defined(TARGET_ARM64) #include "instrsarm64.h" -#elif defined(TARGET_WASM) -#include "instrswasm.h" #else #error Unsupported or unset target architecture #endif // target type +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/instrswasm.h b/src/coreclr/jit/instrswasm.h deleted file mode 100644 index a335394277f1..000000000000 --- a/src/coreclr/jit/instrswasm.h +++ /dev/null @@ -1,774 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -// -// This file was previously known as instrs.h -// -/***************************************************************************** - * x86 instructions for the JIT compiler - * - * id -- the enum name for the instruction - * nm -- textual name (for assembly dipslay) - * um -- update mode, see IUM_xx enum (rd, wr, or rw) - * mr -- base encoding for R/M[reg] addressing mode - * mi -- base encoding for R/M,icon addressing mode - * rm -- base encoding for reg,R/M addressing mode - * a4 -- base encoding for eax,i32 addressing mode - * rr -- base encoding for register addressing mode - * flags -- flags, see INS_FLAGS_* enum - * -******************************************************************************/ - -// clang-format off -#if !defined(TARGET_WASM32) && !defined(TARGET_WASM64) - #error Unexpected target type -#endif - -#ifndef INST1 -#error At least INST1 must be defined before including this file. -#endif -/*****************************************************************************/ -#ifndef INST0 -#define INST0(id, nm, um, mr, flags) -#endif -#ifndef INST2 -#define INST2(id, nm, um, mr, mi, flags) -#endif -#ifndef INST3 -#define INST3(id, nm, um, mr, mi, rm, flags) -#endif -#ifndef INST4 -#define INST4(id, nm, um, mr, mi, rm, a4, flags) -#endif -#ifndef INST5 -#define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) -#endif - -/*****************************************************************************/ -/* The following is x86-specific */ -/*****************************************************************************/ - -// id nm um mr mi rm a4 rr flags -INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_FLAGS_None) -INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_FLAGS_None) -// Does not affect the stack tracking in the emitter -INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_FLAGS_None) -INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_FLAGS_None) - -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_FLAGS_WritesFlags) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_FLAGS_WritesFlags) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_FLAGS_WritesFlags) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_FLAGS_WritesFlags) - -// Multi-byte opcodes without modrm are represented in mixed endian fashion. -// See comment around quarter way through this file for more information. 
-INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None) - -// id nm um mr mi rm a4 flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_FLAGS_WritesFlags) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_FLAGS_WritesFlags) -INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_FLAGS_WritesFlags) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_FLAGS_WritesFlags) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_FLAGS_WritesFlags) -INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_FLAGS_WritesFlags) -INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_FLAGS_WritesFlags) -INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_FLAGS_None) - -INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_FLAGS_None) - -// id nm um mr mi rm flags - -// Note that emitter has only partial support for BT. It can only emit the reg,reg form -// and the registers need to be reversed to get the correct encoding. -INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_FLAGS_WritesFlags) - -INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_FLAGS_WritesFlags) -INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_FLAGS_WritesFlags) - -INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_FLAGS_None) -#ifdef TARGET_AMD64 -INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_FLAGS_None) -#endif -INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_FLAGS_None) - -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_FLAGS_ReadsFlags) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_FLAGS_ReadsFlags) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_FLAGS_ReadsFlags) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_FLAGS_ReadsFlags) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_FLAGS_ReadsFlags) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_FLAGS_ReadsFlags) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_FLAGS_ReadsFlags) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_FLAGS_ReadsFlags) -INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_FLAGS_ReadsFlags) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_FLAGS_ReadsFlags) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_FLAGS_ReadsFlags) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_FLAGS_ReadsFlags) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_FLAGS_ReadsFlags) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_FLAGS_ReadsFlags) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_FLAGS_ReadsFlags) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_FLAGS_ReadsFlags) - -INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_FLAGS_None) -INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, INS_FLAGS_WritesFlags) // op1 *= op2 - -// id nm um mr mi rm flags - -// Instead of encoding these as 3-operand instructions, we encode them -// as 2-operand instructions with the target register being implicit -// 
implicit_reg = op1*op2_icon -#define INSTMUL INST3 -INSTMUL(imul_AX, "imul", IUM_RD, BAD_CODE, 0x000068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_CX, "imul", IUM_RD, BAD_CODE, 0x000868, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_DX, "imul", IUM_RD, BAD_CODE, 0x001068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_BX, "imul", IUM_RD, BAD_CODE, 0x001868, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_SP, "imul", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_BP, "imul", IUM_RD, BAD_CODE, 0x002868, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_SI, "imul", IUM_RD, BAD_CODE, 0x003068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_DI, "imul", IUM_RD, BAD_CODE, 0x003868, BAD_CODE, INS_FLAGS_WritesFlags) - -#ifdef TARGET_AMD64 - -INSTMUL(imul_08, "imul", IUM_RD, BAD_CODE, 0x4400000068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_09, "imul", IUM_RD, BAD_CODE, 0x4400000868, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_10, "imul", IUM_RD, BAD_CODE, 0x4400001068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_11, "imul", IUM_RD, BAD_CODE, 0x4400001868, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_12, "imul", IUM_RD, BAD_CODE, 0x4400002068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_13, "imul", IUM_RD, BAD_CODE, 0x4400002868, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_14, "imul", IUM_RD, BAD_CODE, 0x4400003068, BAD_CODE, INS_FLAGS_WritesFlags) -INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, BAD_CODE, INS_FLAGS_WritesFlags) - -#endif // TARGET_AMD64 - -// the hex codes in this file represent the instruction encoding as follows: -// 0x0000ff00 - modrm byte position -// 0x000000ff - last byte of opcode (before modrm) -// 0x00ff0000 - first byte of opcode -// 0xff000000 - middle byte of opcode, if needed (after first, before last) -// -// So a 1-byte opcode is: and with modrm: -// 0x00000011 0x0000RM11 -// -// So a 2-byte opcode is: and with modrm: -// 0x00002211 0x0011RM22 -// -// So a 3-byte opcode is: and with modrm: -// 0x00113322 0x2211RM33 -// -// So a 4-byte opcode would be something like this: -// 0x22114433 - -#define PACK3(byte1,byte2,byte3) (((byte1) << 16) | ((byte2) << 24) | (byte3)) -#define PACK2(byte1,byte2) (((byte1) << 16) | (byte2)) -#define SSEFLT(c) PACK3(0xf3, 0x0f, c) -#define SSEDBL(c) PACK3(0xf2, 0x0f, c) -#define PCKDBL(c) PACK3(0x66, 0x0f, c) -#define PCKFLT(c) PACK2(0x0f,c) - -// These macros encode extra byte that is implicit in the macro. -#define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8)) -#define SSE38(c) PACK4(0x66, 0x0f, 0x38, c) -#define SSE3A(c) PACK4(0x66, 0x0f, 0x3A, c) - -// VEX* encodes the implied leading opcode bytes in c1: -// 1: implied 0f, 2: implied 0f 38, 3: implied 0f 3a -#define VEX2INT(c1,c2) PACK3(c1, 0xc5, c2) -#define VEX3INT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) -#define VEX3FLT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) - -INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -// These are the SSE instructions used on x86 -INST3(mov_i2xmm, "movd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6E), INS_FLAGS_None) // Move int reg to a xmm reg. reg1=xmm reg, reg2=int reg -INST3(mov_xmm2i, "movd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7E), INS_FLAGS_None) // Move xmm reg to an int reg. 
reg1=xmm reg, reg2=int reg -INST3(pmovmskb, "pmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), INS_FLAGS_None) // Move the MSB bits of all bytes in a xmm reg to an int reg -INST3(movmskpd, "movmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_FLAGS_None) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. -INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_FLAGS_None) -INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_FLAGS_None) -INST3(movsdsse2, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_Flags_IsDstSrcSrcAVXInstruction) - -INST3(punpckldq, "punpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), INS_Flags_IsDstDstSrcAVXInstruction) - -INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles - -INST3(cvttsd2si, "cvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), INS_FLAGS_None) // cvt with trunc scalar double to signed DWORDs - -INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_FLAGS_None) -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_FLAGS_None) -INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movhps, "movhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movss, "movss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), INS_FLAGS_None) -INST3(movaps, "movaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), INS_FLAGS_None) -INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_FLAGS_None) -INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), INS_FLAGS_None) -INST3(movhlps, "movhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(movlhps, "movlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(movmskps, "movmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), INS_FLAGS_None) -INST3(unpckhps, "unpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(maskmovdqu, "maskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), INS_FLAGS_None) - -INST3(shufps, "shufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), INS_Flags_IsDstDstSrcAVXInstruction) -INST3(shufpd, "shufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), INS_Flags_IsDstDstSrcAVXInstruction) - -INST3(punpckhdq, "punpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), INS_Flags_IsDstDstSrcAVXInstruction) - -INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetchnta, "prefetchnta", 
IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -// SSE 2 arith -INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles -INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles -INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles -INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles -INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles -INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single -INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles -INST3(mulsd, "mulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles -INST3(subps, "subps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles -INST3(subss, "subss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar singles -INST3(subpd, "subpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles -INST3(subsd, "subsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5C), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract scalar doubles -INST3(minps, "minps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles -INST3(minss, "minss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single -INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed doubles -INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double -INST3(divps, "divps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles -INST3(divss, "divss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles -INST3(divpd, "divpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles -INST3(divsd, "divsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles -INST3(maxps, "maxps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed singles -INST3(maxss, "maxss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar single -INST3(maxpd, "maxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles -INST3(maxsd, "maxsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5F), INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum scalar double -INST3(xorpd, 
"xorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles -INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles -INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles -INST3(sqrtps, "sqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), INS_FLAGS_None) // Sqrt of packed singles -INST3(sqrtss, "sqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single -INST3(sqrtpd, "sqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), INS_FLAGS_None) // Sqrt of packed doubles -INST3(sqrtsd, "sqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double -INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles -INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles -INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles -INST3(orpd, "orpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles -INST3(haddpd, "haddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles -INST3(haddps, "haddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats -INST3(hsubpd, "hsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles -INST3(hsubps, "hsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats -INST3(addsubps, "addsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles -INST3(addsubpd, "addsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles - -// SSE 2 approx arith -INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_FLAGS_None) // Reciprocal of packed singles -INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single -INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_FLAGS_None) // Reciprocal Sqrt of packed singles -INST3(rsqrtss, "rsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single - -// SSE2 conversions -INST3(cvtpi2ps, "cvtpi2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2A), INS_FLAGS_None) // cvt packed DWORDs to singles -INST3(cvtsi2ss, "cvtsi2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar single -INST3(cvtpi2pd, "cvtpi2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2A), INS_FLAGS_None) // cvt packed DWORDs to doubles -INST3(cvtsi2sd, "cvtsi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt DWORD to scalar double -INST3(cvttps2pi, "cvttps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2C), INS_FLAGS_None) // cvt with trunc packed singles to DWORDs -INST3(cvttss2si, "cvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), INS_FLAGS_None) // cvt with trunc scalar single to DWORD -INST3(cvttpd2pi, "cvttpd2pi", IUM_WR, BAD_CODE, 
BAD_CODE, PCKDBL(0x2C), INS_FLAGS_None) // cvt with trunc packed doubles to DWORDs -INST3(cvtps2pi, "cvtps2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x2D), INS_FLAGS_None) // cvt packed singles to DWORDs -INST3(cvtss2si, "cvtss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2D), INS_FLAGS_None) // cvt scalar single to DWORD -INST3(cvtpd2pi, "cvtpd2pi", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x2D), INS_FLAGS_None) // cvt packed doubles to DWORDs -INST3(cvtsd2si, "cvtsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2D), INS_FLAGS_None) // cvt scalar double to DWORD -INST3(cvtps2pd, "cvtps2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5A), INS_FLAGS_None) // cvt packed singles to doubles -INST3(cvtpd2ps, "cvtpd2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5A), INS_FLAGS_None) // cvt packed doubles to singles -INST3(cvtss2sd, "cvtss2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar single to scalar doubles -INST3(cvtsd2ss, "cvtsd2ss", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5A), INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar double to scalar singles -INST3(cvtdq2ps, "cvtdq2ps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5B), INS_FLAGS_None) // cvt packed DWORDs to singles -INST3(cvtps2dq, "cvtps2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5B), INS_FLAGS_None) // cvt packed singles to DWORDs -INST3(cvttps2dq, "cvttps2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5B), INS_FLAGS_None) // cvt with trunc packed singles to DWORDs -INST3(cvtpd2dq, "cvtpd2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xE6), INS_FLAGS_None) // cvt packed doubles to DWORDs -INST3(cvttpd2dq, "cvttpd2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE6), INS_FLAGS_None) // cvt with trunc packed doubles to DWORDs -INST3(cvtdq2pd, "cvtdq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xE6), INS_FLAGS_None) // cvt packed DWORDs to doubles - -// SSE2 comparison instructions -INST3(comiss, "comiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2F), INS_FLAGS_None) // ordered compare singles -INST3(comisd, "comisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2F), INS_FLAGS_None) // ordered compare doubles -INST3(ucomiss, "ucomiss", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x2E), INS_FLAGS_None) // unordered compare singles -INST3(ucomisd, "ucomisd", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x2E), INS_FLAGS_None) // unordered compare doubles - -// SSE2 packed single/double comparison operations. -// Note that these instructions not only compare but also overwrite the first source. 
-INST3(cmpps, "cmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare packed singles -INST3(cmppd, "cmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare packed doubles -INST3(cmpss, "cmpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar singles -INST3(cmpsd, "cmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), INS_Flags_IsDstDstSrcAVXInstruction) // compare scalar doubles - -//SSE2 packed integer operations -INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers -INST3(paddw, "paddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers -INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers -INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers -INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results -INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results -INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results -INST3(paddusw, "paddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results -INST3(pavgb, "pavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers -INST3(pavgw, "pavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers -INST3(psubb, "psubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers -INST3(psubw, "psubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers -INST3(psubd, "psubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers -INST3(psubq, "psubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers -INST3(pmaddwd, "pmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. 
Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst -INST3(pmulhw, "pmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers -INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers -INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result -INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result -INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs -INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs -INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers -INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation -INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation -INST3(psubsw, "psubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation -INST3(psubusw, "psubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation - -// Note that the shift immediates share the same encoding between left and right-shift, and are distinguished by the Reg/Opcode, -// which is handled in emitxarch.cpp. 
-INST3(psrldq, "psrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes -INST3(pslldq, "pslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes -INST3(psllw, "psllw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xF1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 16-bit integers -INST3(pslld, "pslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers -INST3(psllq, "psllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers -INST3(psrlw, "psrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers -INST3(psrld, "psrld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xD2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 32-bit integers -INST3(psrlq, "psrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers -INST3(psraw, "psraw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xE1), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 16-bit integers -INST3(psrad, "psrad", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xE2), INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right arithmetic of 32-bit integers - -INST3(pmaxub, "pmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes -INST3(pminub, "pminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes -INST3(pmaxsw, "pmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words -INST3(pminsw, "pminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words -INST3(pcmpeqd, "pcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality -INST3(pcmpgtd, "pcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than -INST3(pcmpeqw, "pcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality -INST3(pcmpgtw, "pcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than -INST3(pcmpeqb, "pcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality -INST3(pcmpgtb, "pcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than - -INST3(pshufd, "pshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), INS_FLAGS_None) // Packed shuffle of 32-bit integers -INST3(pshufhw, "pshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), INS_FLAGS_None) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. -INST3(pshuflw, "pshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), INS_FLAGS_None) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. 
-INST3(pextrw, "pextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), INS_FLAGS_None) // Extract 16-bit value into a r32 with zero extended to 32-bits -INST3(pinsrw, "pinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index - -INST3(punpckhbw, "punpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) -INST3(punpcklbw, "punpcklbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x60), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (lo) -INST3(punpckhqdq, "punpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi) -INST3(punpcklqdq, "punpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo) -INST3(punpckhwd, "punpckhwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x69), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (hi) -INST3(punpcklwd, "punpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo) -INST3(unpckhpd, "unpckhpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x15), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) -INST3(unpcklpd, "unpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) - -INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation -INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation -INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation - -// id nm um mr mi rm flags -INST3(dpps, "dpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs -INST3(dppd, "dppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs -INST3(insertps, "insertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value -INST3(pcmpeqq, "pcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pcmpgtq, "pcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pmulld, "pmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result -INST3(ptest, "ptest", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x17), INS_FLAGS_None) // Packed logical compare -INST3(phaddd, "phaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add -INST3(pabsb, "pabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), INS_FLAGS_None) // Packed absolute value of bytes -INST3(pabsw, "pabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), INS_FLAGS_None) // Packed absolute value of 16-bit integers -INST3(pabsd, "pabsd", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0x1E), INS_FLAGS_None) // Packed absolute value of 32-bit integers -INST3(palignr, "palignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right -INST3(pmaddubsw, "pmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes -INST3(pmulhrsw, "pmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale -INST3(pshufb, "pshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes -INST3(psignb, "psignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignw, "psignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignd, "psignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(pminsb, "pminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes -INST3(pminsd, "pminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers -INST3(pminuw, "pminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers -INST3(pminud, "pminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers -INST3(pmaxsb, "pmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes -INST3(pmaxsd, "pmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers -INST3(pmaxuw, "pmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers -INST3(pmaxud, "pmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers -INST3(pmovsxbw, "pmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), INS_FLAGS_None) // Packed sign extend byte to short -INST3(pmovsxbd, "pmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), INS_FLAGS_None) // Packed sign extend byte to int -INST3(pmovsxbq, "pmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), INS_FLAGS_None) // Packed sign extend byte to long -INST3(pmovsxwd, "pmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), INS_FLAGS_None) // Packed sign extend short to int -INST3(pmovsxwq, "pmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), INS_FLAGS_None) // Packed sign extend short to long -INST3(pmovsxdq, "pmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), INS_FLAGS_None) // Packed sign extend int to long -INST3(pmovzxbw, "pmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), INS_FLAGS_None) // Packed zero extend byte to short -INST3(pmovzxbd, "pmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), INS_FLAGS_None) // Packed zero extend byte to intg -INST3(pmovzxbq, "pmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), INS_FLAGS_None) // Packed zero extend byte to lon -INST3(pmovzxwd, "pmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), INS_FLAGS_None) // Packed zero extend short to int -INST3(pmovzxwq, "pmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), INS_FLAGS_None) // Packed zero extend short to long -INST3(pmovzxdq, "pmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), 
INS_FLAGS_None) // Packed zero extend int to long -INST3(packusdw, "packusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation -INST3(roundps, "roundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), INS_FLAGS_None) // Round packed single precision floating-point values -INST3(roundss, "roundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar single precision floating-point values -INST3(roundpd, "roundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), INS_FLAGS_None) // Round packed double precision floating-point values -INST3(roundsd, "roundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), INS_Flags_IsDstSrcSrcAVXInstruction) // Round scalar double precision floating-point values -INST3(pmuldq, "pmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result -INST3(blendps, "blendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values -INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), INS_FLAGS_None) // Variable Blend Packed Singles -INST3(blendpd, "blendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values -INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), INS_FLAGS_None) // Variable Blend Packed Doubles -INST3(pblendw, "pblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words -INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), INS_FLAGS_None) // Variable Blend Packed Bytes -INST3(phaddw, "phaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers -INST3(phsubw, "phsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers -INST3(phsubd, "phsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers -INST3(phaddsw, "phaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation -INST3(phsubsw, "phsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation -INST3(lddqu, "lddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), INS_FLAGS_None) // Load Unaligned integer -INST3(movntdqa, "movntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), INS_FLAGS_None) // Load Double Quadword Non-Temporal Aligned Hint -INST3(movddup, "movddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), INS_FLAGS_None) // Replicate Double FP Values -INST3(movsldup, "movsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), INS_FLAGS_None) // Replicate even-indexed Single FP Values -INST3(movshdup, "movshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), INS_FLAGS_None) // Replicate odd-indexed Single FP Values -INST3(phminposuw, "phminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), INS_FLAGS_None) // Packed Horizontal Word Minimum -INST3(mpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference -INST3(pinsrb, "pinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), 
INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte -INST3(pinsrd, "pinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword -INST3(pinsrq, "pinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword -INST3(pextrb, "pextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Byte -INST3(pextrd, "pextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Dword -INST3(pextrq, "pextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Qword -INST3(pextrw_sse41, "pextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Word -INST3(extractps, "extractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract Packed Floating-Point Values - -//PCLMULQDQ instructions -INST3(pclmulqdq, "pclmulqdq" , IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords - -//AES instructions -INST3(aesdec, "aesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow -INST3(aesdeclast, "aesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow -INST3(aesenc, "aesenc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDC), INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES encryption flow -INST3(aesenclast, "aesenclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDD), INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES encryption flow -INST3(aesimc, "aesimc", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDB), INS_FLAGS_None) // Perform the AES InvMixColumn Transformation -INST3(aeskeygenassist, "aeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xDF), INS_FLAGS_None) // AES Round Key Generation Assist -INST3(LAST_SSE_INSTRUCTION, "LAST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -// AVX only instructions -INST3(vbroadcastss, "broadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register -INST3(vbroadcastsd, "broadcastsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x19), INS_FLAGS_None) // Broadcast float value read from memory to entire ymm register -INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), INS_FLAGS_None) // Broadcast int8 value from reg/memory to entire ymm register -INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register -INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register -INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register -INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract 128-bit packed floating point values -INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_FLAGS_None) // Extract 128-bit packed integer values -INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values 
-INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values -INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) -INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register -INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_FLAGS_None) // Permute 64-bit of input register -INST3(vpblendd, "pblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs -INST3(vblendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4A), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Singles -INST3(vblendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4B), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Doubles -INST3(vpblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes -INST3(vtestps, "testps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0E), INS_FLAGS_None) // Packed Bit Test -INST3(vtestpd, "testpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0F), INS_FLAGS_None) // Packed Bit Test -INST3(vpsrlvd, "psrlvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical -INST3(vpsrlvq, "psrlvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x45), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Logical -INST3(vpsravd, "psravd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic -INST3(vpsllvd, "psllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical -INST3(vpsllvq, "psllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical -INST3(vpermilps, "permilps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x04), INS_FLAGS_None) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vpermilpd, "permilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), INS_FLAGS_None) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values -INST3(vpermilpsvar, "permilpsvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0C), INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Single-Precision Floating-Point Values -INST3(vpermilpdvar, "permilpdvar", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0D), INS_Flags_IsDstDstSrcAVXInstruction) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values -INST3(vperm2f128, "perm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values -INST3(vpermpd, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x01), INS_FLAGS_None) // Permute Double-Precision Floating-Point Values -INST3(vpermd, "permd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Packed Doublewords Elements -INST3(vpermps, "permps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_Flags_IsDstDstSrcAVXInstruction) // Permute Single-Precision Floating-Point Elements -INST3(vbroadcastf128, "broadcastf128", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1A), INS_FLAGS_None) // Broadcast packed float values read from memory to entire ymm register -INST3(vbroadcasti128, "broadcasti128", IUM_WR, BAD_CODE, BAD_CODE, 
SSE38(0x5A), INS_FLAGS_None) // Broadcast packed integer values read from memory to entire ymm register -INST3(vmaskmovps, "maskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores -INST3(vmaskmovpd, "maskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores -INST3(vpmaskmovd, "pmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores -INST3(vpmaskmovq, "pmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores -INST3(vpgatherdd, "pgatherdd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Dword -INST3(vpgatherqd, "pgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword -INST3(vpgatherdq, "pgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices -INST3(vpgatherqq, "pgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword with Signed Dword Indices -INST3(vgatherdps, "gatherdps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Dword Indices -INST3(vgatherqps, "gatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices -INST3(vgatherdpd, "gatherdpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x92), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Dword Indices -INST3(vgatherqpd, "gatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices - -INST3(FIRST_FMA_INSTRUCTION, "FIRST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -// id nm um mr mi rm flags -INST3(vfmadd132pd, "fmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfmadd213pd, "fmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231pd, "fmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ps, "fmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x98), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfmadd213ps, "fmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ps, "fmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB8), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132sd, "fmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x99), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfmadd213sd, "fmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231sd, "fmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd132ss, "fmadd132ss", IUM_WR, 
BAD_CODE, BAD_CODE, SSE38(0x99), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfmadd213ss, "fmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmadd231ss, "fmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB9), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132pd, "fmaddsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmaddsub213pd, "fmaddsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231pd, "fmaddsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub132ps, "fmaddsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x96), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Add/Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmaddsub213ps, "fmaddsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmaddsub231ps, "fmaddsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB6), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132pd, "fmsubadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Double-Precision Floating-Point Values -INST3(vfmsubadd213pd, "fmsubadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231pd, "fmsubadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd132ps, "fmsubadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x97), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Alternating Subtract/Add of Packed Single-Precision Floating-Point Values -INST3(vfmsubadd213ps, "fmsubadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xA7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsubadd231ps, "fmsubadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xB7), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132pd, "fmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfmsub213pd, "fmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231pd, "fmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ps, "fmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9A), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfmsub213ps, "fmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ps, "fmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBA), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132sd, "fmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfmsub213sd, "fmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231sd, "fmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub132ss, "fmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9B), INS_Flags_IsDstDstSrcAVXInstruction) // Fused 
Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfmsub213ss, "fmsub213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfmsub231ss, "fmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBB), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132pd, "fnmadd132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Double-Precision Floating-Point Values -INST3(vfnmadd213pd, "fnmadd213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231pd, "fnmadd231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ps, "fnmadd132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9C), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Packed Single-Precision Floating-Point Values -INST3(vfnmadd213ps, "fnmadd213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ps, "fnmadd231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBC), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132sd, "fnmadd132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Double-Precision Floating-Point Values -INST3(vfnmadd213sd, "fnmadd213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231sd, "fnmadd231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd132ss, "fnmadd132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9D), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Add of Scalar Single-Precision Floating-Point Values -INST3(vfnmadd213ss, "fnmadd213ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmadd231ss, "fnmadd231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBD), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132pd, "fnmsub132pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Double-Precision Floating-Point Values -INST3(vfnmsub213pd, "fnmsub213pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231pd, "fnmsub231pd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ps, "fnmsub132ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9E), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Packed Single-Precision Floating-Point Values -INST3(vfnmsub213ps, "fnmsub213ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ps, "fnmsub231ps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBE), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132sd, "fnmsub132sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Double-Precision Floating-Point Values -INST3(vfnmsub213sd, "fnmsub213sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231sd, "fnmsub231sd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub132ss, "fnmsub132ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x9F), INS_Flags_IsDstDstSrcAVXInstruction) // Fused Negative Multiply-Subtract of Scalar Single-Precision Floating-Point Values -INST3(vfnmsub213ss, "fnmsub213ss", IUM_WR, 
BAD_CODE, BAD_CODE, SSE38(0xAF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(vfnmsub231ss, "fnmsub231ss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xBF), INS_Flags_IsDstDstSrcAVXInstruction) // -INST3(LAST_FMA_INSTRUCTION, "LAST_FMA_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -// BMI1 -INST3(FIRST_BMI_INSTRUCTION, "FIRST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) -INST3(andn, "andn", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF2), INS_Flags_IsDstDstSrcAVXInstruction) // Logical AND NOT -INST3(blsi, "blsi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Extract Lowest Set Isolated Bit -INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Get Mask Up to Lowest Set Bit -INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_Flags_IsDstDstSrcAVXInstruction) // Reset Lowest Set Bit -INST3(bextr, "bextr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_Flags_IsDstDstSrcAVXInstruction) // Bit Field Extract - -// BMI2 -INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_FLAGS_None) -INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit -INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract -INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_Flags_IsDstDstSrcAVXInstruction) // Zero High Bits Starting with Specified Bit Position -INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags - -INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) - -// Scalar instructions in SSE4.2 -INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_FLAGS_None) - -// BMI1 -INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_FLAGS_None) // Count the Number of Trailing Zero Bits - -// LZCNT -INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_FLAGS_None) - -// POPCNT -INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_FLAGS_None) - -// id nm um mr mi flags -INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_FLAGS_None) -INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_FLAGS_None) -INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_FLAGS_WritesFlags) - -INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_FLAGS_WritesFlags) -INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_FLAGS_WritesFlags) -INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_FLAGS_WritesFlags) -INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_FLAGS_WritesFlags) -INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_FLAGS_WritesFlags) -INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_FLAGS_WritesFlags) - -INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, 
INS_FLAGS_ReadsFlags | INS_FLAGS_WritesFlags) -INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_FLAGS_WritesFlags) -INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_FLAGS_WritesFlags) -INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_FLAGS_WritesFlags) -INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_FLAGS_WritesFlags) -INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_FLAGS_WritesFlags) -INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_FLAGS_WritesFlags) -INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_FLAGS_WritesFlags) -INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_FLAGS_WritesFlags) -INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_FLAGS_WritesFlags) - - -// id nm um mr flags -INST1(r_movsb, "rep movsb", IUM_RD, 0x00A4F3, INS_FLAGS_None) -INST1(r_movsd, "rep movsd", IUM_RD, 0x00A5F3, INS_FLAGS_None) -#if defined(TARGET_AMD64) -INST1(r_movsq, "rep movsq", IUM_RD, 0xF3A548, INS_FLAGS_None) -#endif // defined(TARGET_AMD64) -INST1(movsb, "movsb", IUM_RD, 0x0000A4, INS_FLAGS_None) -INST1(movsd, "movsd", IUM_RD, 0x0000A5, INS_FLAGS_None) -#if defined(TARGET_AMD64) -INST1(movsq, "movsq", IUM_RD, 0x00A548, INS_FLAGS_None) -#endif // defined(TARGET_AMD64) - -INST1(r_stosb, "rep stosb", IUM_RD, 0x00AAF3, INS_FLAGS_None) -INST1(r_stosd, "rep stosd", IUM_RD, 0x00ABF3, INS_FLAGS_None) -#if defined(TARGET_AMD64) -INST1(r_stosq, "rep stosq", IUM_RD, 0xF3AB48, INS_FLAGS_None) -#endif // defined(TARGET_AMD64) -INST1(stosb, "stosb", IUM_RD, 0x0000AA, INS_FLAGS_None) -INST1(stosd, "stosd", IUM_RD, 0x0000AB, INS_FLAGS_None) -#if defined(TARGET_AMD64) -INST1(stosq, "stosq", IUM_RD, 0x00AB48, INS_FLAGS_None) -#endif // defined(TARGET_AMD64) - -INST1(int3, "int3", IUM_RD, 0x0000CC, INS_FLAGS_None) -INST1(nop, "nop", IUM_RD, 0x000090, INS_FLAGS_None) -INST1(lock, "lock", IUM_RD, 0x0000F0, INS_FLAGS_None) -INST1(leave, "leave", IUM_RD, 0x0000C9, INS_FLAGS_None) - - -INST1(neg, "neg", IUM_RW, 0x0018F6, INS_FLAGS_WritesFlags) -INST1(not, "not", IUM_RW, 0x0010F6, INS_FLAGS_WritesFlags) - -INST1(cdq, "cdq", IUM_RD, 0x000099, INS_FLAGS_WritesFlags) -INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_FLAGS_WritesFlags) -INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_FLAGS_WritesFlags) // edx:eax = eax*op1 -INST1(div, "div", IUM_RD, 0x0030F6, INS_FLAGS_WritesFlags) -INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_FLAGS_WritesFlags) - -INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_FLAGS_WritesFlags) - -INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_FLAGS_WritesFlags) -INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_FLAGS_WritesFlags) - -INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_FLAGS_WritesFlags) -INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_FLAGS_WritesFlags) - -// For RyuJIT/x86, we follow the x86 calling convention that requires -// us to return floating point value on the x87 FP stack, so we need -// these instructions regardless of whether we're using full stack fp. 
-#ifdef TARGET_X86 -INST1(fld, "fld", IUM_WR, 0x0000D9, INS_FLAGS_x87Instr) -INST1(fstp, "fstp", IUM_WR, 0x0018D9, INS_FLAGS_x87Instr) -#endif // TARGET_X86 - -INST1(seto, "seto", IUM_WR, 0x0F0090, INS_FLAGS_ReadsFlags) -INST1(setno, "setno", IUM_WR, 0x0F0091, INS_FLAGS_ReadsFlags) -INST1(setb, "setb", IUM_WR, 0x0F0092, INS_FLAGS_ReadsFlags) -INST1(setae, "setae", IUM_WR, 0x0F0093, INS_FLAGS_ReadsFlags) -INST1(sete, "sete", IUM_WR, 0x0F0094, INS_FLAGS_ReadsFlags) -INST1(setne, "setne", IUM_WR, 0x0F0095, INS_FLAGS_ReadsFlags) -INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_FLAGS_ReadsFlags) -INST1(seta, "seta", IUM_WR, 0x0F0097, INS_FLAGS_ReadsFlags) -INST1(sets, "sets", IUM_WR, 0x0F0098, INS_FLAGS_ReadsFlags) -INST1(setns, "setns", IUM_WR, 0x0F0099, INS_FLAGS_ReadsFlags) -INST1(setp, "setp", IUM_WR, 0x0F009A, INS_FLAGS_ReadsFlags) -INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_FLAGS_ReadsFlags) -INST1(setl, "setl", IUM_WR, 0x0F009C, INS_FLAGS_ReadsFlags) -INST1(setge, "setge", IUM_WR, 0x0F009D, INS_FLAGS_ReadsFlags) -INST1(setle, "setle", IUM_WR, 0x0F009E, INS_FLAGS_ReadsFlags) -INST1(setg, "setg", IUM_WR, 0x0F009F, INS_FLAGS_ReadsFlags) - -#ifdef TARGET_AMD64 -// A jump with rex prefix. This is used for register indirect -// tail calls. -INST1(rex_jmp, "rex.jmp", IUM_RD, 0x0020FE, INS_FLAGS_None) -#endif - -INST1(i_jmp, "jmp", IUM_RD, 0x0020FE, INS_FLAGS_None) - -INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_FLAGS_None) -INST0(jo, "jo", IUM_RD, 0x000070, INS_FLAGS_ReadsFlags) -INST0(jno, "jno", IUM_RD, 0x000071, INS_FLAGS_ReadsFlags) -INST0(jb, "jb", IUM_RD, 0x000072, INS_FLAGS_ReadsFlags) -INST0(jae, "jae", IUM_RD, 0x000073, INS_FLAGS_ReadsFlags) -INST0(je, "je", IUM_RD, 0x000074, INS_FLAGS_ReadsFlags) -INST0(jne, "jne", IUM_RD, 0x000075, INS_FLAGS_ReadsFlags) -INST0(jbe, "jbe", IUM_RD, 0x000076, INS_FLAGS_ReadsFlags) -INST0(ja, "ja", IUM_RD, 0x000077, INS_FLAGS_ReadsFlags) -INST0(js, "js", IUM_RD, 0x000078, INS_FLAGS_ReadsFlags) -INST0(jns, "jns", IUM_RD, 0x000079, INS_FLAGS_ReadsFlags) -INST0(jp, "jp", IUM_RD, 0x00007A, INS_FLAGS_ReadsFlags) -INST0(jnp, "jnp", IUM_RD, 0x00007B, INS_FLAGS_ReadsFlags) -INST0(jl, "jl", IUM_RD, 0x00007C, INS_FLAGS_ReadsFlags) -INST0(jge, "jge", IUM_RD, 0x00007D, INS_FLAGS_ReadsFlags) -INST0(jle, "jle", IUM_RD, 0x00007E, INS_FLAGS_ReadsFlags) -INST0(jg, "jg", IUM_RD, 0x00007F, INS_FLAGS_ReadsFlags) - -INST0(l_jmp, "jmp", IUM_RD, 0x0000E9, INS_FLAGS_None) -INST0(l_jo, "jo", IUM_RD, 0x00800F, INS_FLAGS_ReadsFlags) -INST0(l_jno, "jno", IUM_RD, 0x00810F, INS_FLAGS_ReadsFlags) -INST0(l_jb, "jb", IUM_RD, 0x00820F, INS_FLAGS_ReadsFlags) -INST0(l_jae, "jae", IUM_RD, 0x00830F, INS_FLAGS_ReadsFlags) -INST0(l_je, "je", IUM_RD, 0x00840F, INS_FLAGS_ReadsFlags) -INST0(l_jne, "jne", IUM_RD, 0x00850F, INS_FLAGS_ReadsFlags) -INST0(l_jbe, "jbe", IUM_RD, 0x00860F, INS_FLAGS_ReadsFlags) -INST0(l_ja, "ja", IUM_RD, 0x00870F, INS_FLAGS_ReadsFlags) -INST0(l_js, "js", IUM_RD, 0x00880F, INS_FLAGS_ReadsFlags) -INST0(l_jns, "jns", IUM_RD, 0x00890F, INS_FLAGS_ReadsFlags) -INST0(l_jp, "jp", IUM_RD, 0x008A0F, INS_FLAGS_ReadsFlags) -INST0(l_jnp, "jnp", IUM_RD, 0x008B0F, INS_FLAGS_ReadsFlags) -INST0(l_jl, "jl", IUM_RD, 0x008C0F, INS_FLAGS_ReadsFlags) -INST0(l_jge, "jge", IUM_RD, 0x008D0F, INS_FLAGS_ReadsFlags) -INST0(l_jle, "jle", IUM_RD, 0x008E0F, INS_FLAGS_ReadsFlags) -INST0(l_jg, "jg", IUM_RD, 0x008F0F, INS_FLAGS_ReadsFlags) - -INST0(align, "align", IUM_RD, BAD_CODE, INS_FLAGS_None) - -/*****************************************************************************/ -#undef INST0 -#undef INST1 
-#undef INST2 -#undef INST3 -#undef INST4 -#undef INST5 -/*****************************************************************************/ - -// clang-format on diff --git a/src/coreclr/jit/jitgcinfo.h b/src/coreclr/jit/jitgcinfo.h index 83393bda9bbe..1274d9a2a162 100644 --- a/src/coreclr/jit/jitgcinfo.h +++ b/src/coreclr/jit/jitgcinfo.h @@ -170,10 +170,12 @@ class GCInfo } unsigned short rpdGCtype : 2; // is this a pointer, after all? +#ifndef TARGET_WASM GCtype rpdGCtypeGet() { return (GCtype)rpdGCtype; } +#endif // !TARGET_WASM unsigned short rpdIsThis : 1; // is it the 'this' pointer unsigned short rpdCall : 1; // is this a true call site? diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 60de4a035742..ac5318764c0e 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -735,7 +735,7 @@ class LinearScan : public LinearScanInterface // Hence the "SmallFPSet" has 5 elements. CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_AMD64) || defined(TARGET_WASM) +#if defined(TARGET_AMD64) #ifdef UNIX_AMD64_ABI // On System V the RDI and RSI are not callee saved. Use R12 ans R13 as callee saved registers. static const regMaskTP LsraLimitSmallIntSet = @@ -757,17 +757,6 @@ class LinearScan : public LinearScanInterface #elif defined(TARGET_X86) static const regMaskTP LsraLimitSmallIntSet = (RBM_EAX | RBM_ECX | RBM_EDI); static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); -#elif defined(TARGET_WASM) -#ifdef UNIX_AMD64_ABI - // On System V the RDI and RSI are not callee saved. Use R12 ans R13 as callee saved registers. - static const regMaskTP LsraLimitSmallIntSet = - (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_R12 | RBM_R13); -#else // !UNIX_AMD64_ABI - // On Windows Amd64 use the RDI and RSI as callee saved registers. - static const regMaskTP LsraLimitSmallIntSet = - (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI); -#endif // !UNIX_AMD64_ABI - static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); #else #error Unsupported or unset target architecture #endif // target @@ -1733,7 +1722,7 @@ class LinearScan : public LinearScanInterface void setDelayFree(RefPosition* use); int BuildBinaryUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE); -#if defined(TARGET_XARCH) || defined(TARGET_WASM) +#ifdef TARGET_XARCH int BuildRMWUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE); #endif // !TARGET_XARCH // This is the main entry point for building the RefPositions for a node. @@ -1754,7 +1743,7 @@ class LinearScan : public LinearScanInterface void BuildDefsWithKills(GenTree* tree, int dstCount, regMaskTP dstCandidates, regMaskTP killMask); int BuildReturn(GenTree* tree); -#if defined(TARGET_XARCH) || defined(TARGET_WASM) +#ifdef TARGET_XARCH // This method, unlike the others, returns the number of sources, since it may be called when // 'tree' is contained. 
int BuildShiftRotate(GenTree* tree); @@ -1775,7 +1764,7 @@ class LinearScan : public LinearScanInterface int BuildGCWriteBarrier(GenTree* tree); int BuildCast(GenTreeCast* cast); -#if defined(TARGET_XARCH) || defined(TARGET_WASM) +#ifdef TARGET_XARCH // returns true if the tree can use the read-modify-write memory instruction form bool isRMWRegOper(GenTree* tree); int BuildMul(GenTree* tree); diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index cae8de063bc9..1746cf492be6 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -1,6 +1,8 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#ifndef TARGET_WASM + #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -19,12 +21,8 @@ StackLevelSetter::StackLevelSetter(Compiler* compiler) , throwHelperBlocksUsed(comp->fgUseThrowHelperBlocks() && comp->compUsesThrowHelper) #endif // !FEATURE_FIXED_OUT_ARGS { -#ifndef TARGET_WASM // The constructor reads this value to skip iterations that could set it if it is already set. compiler->codeGen->resetWritePhaseForFramePointerRequired(); -#else - assert(false); // Wasm - TODO can this be ignored? -#endif // !TARGET_WASM } //------------------------------------------------------------------------ @@ -338,11 +336,7 @@ void StackLevelSetter::CheckArgCnt() printf("Too many pushed arguments for an ESP based encoding, forcing an EBP frame\n"); } #endif -#ifndef TARGET_WASM comp->codeGen->setFramePointerRequired(true); -#else - assert(false); // Wasm - TODO can this be ignored? -#endif // !TARGET_WASM } } @@ -365,3 +359,4 @@ void StackLevelSetter::CheckAdditionalArgs() } #endif // TARGET_X86 } +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/stacklevelsetter.h b/src/coreclr/jit/stacklevelsetter.h index f43558f09769..48208cda198a 100644 --- a/src/coreclr/jit/stacklevelsetter.h +++ b/src/coreclr/jit/stacklevelsetter.h @@ -3,6 +3,8 @@ #pragma once +#ifndef TARGET_WASM + #include "compiler.h" #include "phase.h" @@ -42,3 +44,5 @@ class StackLevelSetter final : public Phase bool throwHelperBlocksUsed; // Were any throw helper blocks created for this method. 
#endif // !FEATURE_FIXED_OUT_ARGS }; + +#endif // !TARGET_WASM diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index 5f129106fcaf..b61a630a85f1 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -3,7 +3,11 @@ #define GCS EA_GCREF #define BRS EA_BYREF +#if defined(TARGET_WASM) +#define PS TARGET_POINTER_SIZE +#else #define PS EA_PTRSIZE +#endif #define PST (TARGET_POINTER_SIZE / sizeof(int)) #ifdef TARGET_64BIT diff --git a/src/coreclr/tools/aot/ilc.sln b/src/coreclr/tools/aot/ilc.sln index 5c12affa7263..4f8dc84b9684 100644 --- a/src/coreclr/tools/aot/ilc.sln +++ b/src/coreclr/tools/aot/ilc.sln @@ -22,6 +22,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_browser_wasm32_x64", EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x64_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x64_x64.vcxproj", "{A75E7596-C53A-3C6F-8FD7-AC56E41F3783}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x86_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x86_x64.vcxproj", "{AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Checked|Any CPU = Checked|Any CPU @@ -238,6 +240,22 @@ Global {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|Any CPU.ActiveCfg = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.ActiveCfg = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.Build.0 = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x86.ActiveCfg = Checked|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|Any CPU.ActiveCfg = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.ActiveCfg = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.Build.0 = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x86.ActiveCfg = Debug|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|Any CPU.ActiveCfg = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.ActiveCfg = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.Build.0 = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x86.ActiveCfg = Release|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE From d2075e7966c425dc4d7838bc8d4cc682a28fce0a Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 8 Feb 2021 15:39:28 -0500 Subject: [PATCH 11/44] fix ifdef nesting for release config --- src/coreclr/jit/ee_il_dll.cpp | 2 +- src/coreclr/tools/aot/ilc.sln | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/src/coreclr/jit/ee_il_dll.cpp b/src/coreclr/jit/ee_il_dll.cpp index fdee523d5bba..8cb55e247b20 100644 --- a/src/coreclr/jit/ee_il_dll.cpp +++ b/src/coreclr/jit/ee_il_dll.cpp @@ -761,8 +761,8 @@ void Compiler::eeGetVars() #endif // DEBUG } -#ifndef TARGET_WASM #ifdef 
DEBUG +#ifndef TARGET_WASM void Compiler::eeDispVar(ICorDebugInfo::NativeVarInfo* var) { const char* name = nullptr; diff --git a/src/coreclr/tools/aot/ilc.sln b/src/coreclr/tools/aot/ilc.sln index 4f8dc84b9684..5c12affa7263 100644 --- a/src/coreclr/tools/aot/ilc.sln +++ b/src/coreclr/tools/aot/ilc.sln @@ -22,8 +22,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_browser_wasm32_x64", EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x64_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x64_x64.vcxproj", "{A75E7596-C53A-3C6F-8FD7-AC56E41F3783}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clrjit_win_x86_x64", "..\..\..\..\artifacts\obj\coreclr\windows.x64.Debug\jit\clrjit_win_x86_x64.vcxproj", "{AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}" -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Checked|Any CPU = Checked|Any CPU @@ -240,22 +238,6 @@ Global {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 {A75E7596-C53A-3C6F-8FD7-AC56E41F3783}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|Any CPU.ActiveCfg = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.ActiveCfg = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x64.Build.0 = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Checked|x86.ActiveCfg = Checked|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|Any CPU.ActiveCfg = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.ActiveCfg = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x64.Build.0 = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Debug|x86.ActiveCfg = Debug|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|Any CPU.ActiveCfg = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.ActiveCfg = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x64.Build.0 = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.Release|x86.ActiveCfg = Release|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 - {AD5CAC34-CF44-34E0-AF16-7725F5B2AF94}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE From ffe32da51af8eee798c2ac88f14f5bf3d9d34417 Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 8 Feb 2021 16:36:35 -0500 Subject: [PATCH 12/44] fix more endif syntax for nix --- src/coreclr/jit/gcinfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gcinfo.cpp b/src/coreclr/jit/gcinfo.cpp index 81ffa1115d4e..13ed5a615ba0 100644 --- a/src/coreclr/jit/gcinfo.cpp +++ b/src/coreclr/jit/gcinfo.cpp @@ -221,7 +221,7 @@ void GCInfo::gcMarkRegPtrVal(regNumber reg, var_types type) break; } } -#endif !TARGET_WASM +#endif // !TARGET_WASM /*****************************************************************************/ From 9232ad1c47e76326ea9efa612fcb49ee505504f7 Mon Sep 17 00:00:00 2001 From: yowl Date: Sat, 13 Feb 2021 10:55:46 -0500 Subject: [PATCH 13/44] Dont include eeDispVars for wasm --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/ee_il_dll.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) 
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index a3ab9051b311..383f72e2f4d4 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7627,12 +7627,12 @@ class Compiler unsigned varNum, const CodeGenInterface::siVarLoc& loc); void eeSetLVdone(); -#endif #ifdef DEBUG void eeDispVar(ICorDebugInfo::NativeVarInfo* var); void eeDispVars(CORINFO_METHOD_HANDLE ftn, ULONG32 cVars, ICorDebugInfo::NativeVarInfo* vars); #endif // DEBUG +#endif // ICorJitInfo wrappers diff --git a/src/coreclr/jit/ee_il_dll.cpp b/src/coreclr/jit/ee_il_dll.cpp index 8cb55e247b20..b71df6aadead 100644 --- a/src/coreclr/jit/ee_il_dll.cpp +++ b/src/coreclr/jit/ee_il_dll.cpp @@ -857,7 +857,6 @@ void Compiler::eeDispVar(ICorDebugInfo::NativeVarInfo* var) printf("\n"); } -#endif // !TARGET_WASM // Same parameters as ICorStaticInfo::setVars(). void Compiler::eeDispVars(CORINFO_METHOD_HANDLE ftn, ULONG32 cVars, ICorDebugInfo::NativeVarInfo* vars) @@ -879,6 +878,7 @@ void Compiler::eeDispVars(CORINFO_METHOD_HANDLE ftn, ULONG32 cVars, ICorDebugInf eeDispVar(&vars[i]); } } +#endif // !TARGET_WASM #endif // DEBUG /***************************************************************************** From e549b87039dc59c0dd498d62c70c17c2515cdf56 Mon Sep 17 00:00:00 2001 From: yowl Date: Sat, 13 Feb 2021 13:22:44 -0500 Subject: [PATCH 14/44] exclude wasm jit from unix builds - wasm doesn't build there at the moment attempt to add a second platform to the wasm yaml build fix problem where Array.Resize has its code compiled twice. --- eng/pipelines/runtimelab.yml | 1 + src/coreclr/jit/CMakeLists.txt | 8 +++++--- .../ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs | 5 ++++- .../Compiler/LLVMCodegenCompilation.cs | 15 ++++++++------- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/eng/pipelines/runtimelab.yml b/eng/pipelines/runtimelab.yml index 4dd3e2d9ee2a..26ff64a54677 100644 --- a/eng/pipelines/runtimelab.yml +++ b/eng/pipelines/runtimelab.yml @@ -79,6 +79,7 @@ stages: runtimeFlavor: coreclr platforms: - Browser_wasm + - windows_x64 jobParameters: timeoutInMinutes: 90 testGroup: innerloop diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 7fe9599e385b..89278bccc441 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -542,9 +542,11 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) - create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) - # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll - #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) + if (NOT CLR_CMAKE_HOST_UNIX) + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) + # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll + #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) + endif (NOT CLR_CMAKE_HOST_UNIX) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs index 
71da8e073844..c67719f3e3c4 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs @@ -834,8 +834,11 @@ LLVMMetadataRef CreateDebugFunctionAndDiLocation(DebugMetadata debugMetadata, IL if (_debugFunction.Handle == IntPtr.Zero) { LLVMMetadataRef functionMetaType = _compilation.DIBuilder.CreateSubroutineType(debugMetadata.File, - ReadOnlySpan.Empty, LLVMDIFlags.LLVMDIFlagZero); + ReadOnlySpan.Empty /* TODO */, LLVMDIFlags.LLVMDIFlagZero); + if (_method.Name == "Resize") + { + } uint lineNumber = (uint) _debugInformation.GetSequencePoints().FirstOrDefault().LineNumber; _debugFunction = _compilation.DIBuilder.CreateFunction(debugMetadata.File, _method.Name, _method.Name, debugMetadata.File, diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index d8205ad81590..1dfd1edc2883 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -62,7 +62,7 @@ protected override void CompileInternal(string outputFile, ObjectDumper dumper) var nodes = _dependencyGraph.MarkedNodeList; - Console.WriteLine($"RyuJIT compilation results, total method {totalMethodCount} RyuJit Methods {ryuJitMethodCount} % {ryuJitMethodCount * 100 / totalMethodCount}"); + Console.WriteLine($"RyuJIT compilation results, total methods {totalMethodCount} RyuJit Methods {ryuJitMethodCount} % {((decimal)ryuJitMethodCount * 100 / totalMethodCount):n4}"); LLVMObjectWriter.EmitObject(outputFile, nodes, NodeFactory, this, dumper); } @@ -86,8 +86,6 @@ protected override void ComputeDependencyNodeDependencies(List methodsToCompile) foreach (LLVMMethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) { + if (methodCodeNodeNeedingCode.StaticDependenciesAreComputed) + continue; + if (Logger.IsVerbose) { Logger.Writer.WriteLine($"Compiling {methodCodeNodeNeedingCode.Method}..."); @@ -115,11 +116,11 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC try { - corInfo.CompileMethod(methodCodeNodeNeedingCode); + // corInfo.CompileMethod(methodCodeNodeNeedingCode); ryuJitMethodCount++; - } - catch (CodeGenerationFailedException) - { + // } + // catch (CodeGenerationFailedException) + // { ILImporter.CompileMethod(this, methodCodeNodeNeedingCode); } catch (TypeSystemException ex) From 504c2eca985f1aadc8e8d7412fabbd262a4ec446 Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 14 Feb 2021 10:38:36 -0500 Subject: [PATCH 15/44] extende jitinterface to get the LLVM module --- .../superpmi-shim-counter/icorjitinfo.cpp | 6 + .../superpmi-shim-simple/icorjitinfo.cpp | 5 + src/coreclr/inc/corinfo.h | 4 + src/coreclr/inc/icorjitinfoimpl_generated.h | 2 + src/coreclr/inc/jiteeversionguid.h | 10 +- src/coreclr/jit/ICorJitInfo_API_names.h | 1 + src/coreclr/jit/ICorJitInfo_API_wrapper.hpp | 8 ++ src/coreclr/jit/compiler.cpp | 4 +- .../tools/Common/JitInterface/CorInfoBase.cs | 128 ++++++++++-------- .../tools/Common/JitInterface/CorInfoImpl.cs | 5 + .../ThunkGenerator/ThunkInput.txt | 1 + .../Compiler/LLVMCodegenCompilation.cs | 10 +- .../JitInterface/CorInfoImpl.RyuJit.cs | 8 ++ .../tools/aot/jitinterface/jitinterface.h | 9 ++ 14 files changed, 134 insertions(+), 67 deletions(-) diff --git a/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp 
b/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp index 6945b6d74d38..5c8af152b618 100644 --- a/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp +++ b/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp @@ -937,6 +937,12 @@ bool interceptor_ICJI::getSystemVAmd64PassStructInRegisterDescriptor( return original_ICorJitInfo->getSystemVAmd64PassStructInRegisterDescriptor(structHnd, structPassInRegDescPtr); } +void* interceptor_ICJI::getLlvmModule() +{ + mcs->AddCall("getLlvmModule"); + return original_ICorJitInfo->getLlvmModule(); +} + DWORD interceptor_ICJI::getThreadTLSIndex( void** ppIndirection) { diff --git a/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp index 1d73a9759c60..a5876d454593 100644 --- a/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp +++ b/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp @@ -820,6 +820,11 @@ bool interceptor_ICJI::getSystemVAmd64PassStructInRegisterDescriptor( return original_ICorJitInfo->getSystemVAmd64PassStructInRegisterDescriptor(structHnd, structPassInRegDescPtr); } +void* interceptor_ICJI::getLlvmModule() +{ + return original_ICorJitInfo->getLlvmModule(); +} + DWORD interceptor_ICJI::getThreadTLSIndex( void** ppIndirection) { diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h index bdaae65d65ff..7ca0d1a2d52d 100644 --- a/src/coreclr/inc/corinfo.h +++ b/src/coreclr/inc/corinfo.h @@ -2827,6 +2827,10 @@ class ICorStaticInfo /* OUT */ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr ) = 0; + // return the LLVM Module. This is static as there is only one module. + virtual void* getLlvmModule( + ) = 0; + }; /***************************************************************************** diff --git a/src/coreclr/inc/icorjitinfoimpl_generated.h b/src/coreclr/inc/icorjitinfoimpl_generated.h index 20219a094d65..b657cbe15081 100644 --- a/src/coreclr/inc/icorjitinfoimpl_generated.h +++ b/src/coreclr/inc/icorjitinfoimpl_generated.h @@ -478,6 +478,8 @@ bool getSystemVAmd64PassStructInRegisterDescriptor( CORINFO_CLASS_HANDLE structHnd, SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr) override; +void* getLlvmModule() override; + DWORD getThreadTLSIndex( void** ppIndirection) override; diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 69d85eda196d..b784e539abad 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -32,11 +32,11 @@ ////////////////////////////////////////////////////////////////////////////////////////////////////////// // -constexpr GUID JITEEVersionIdentifier = { /* 000b3acb-92d2-4003-8760-e545241dd9a8 */ - 0x000b3acb, - 0x92d2, - 0x4003, - {0x87, 0x60, 0xe5, 0x45, 0x24, 0x1d, 0xd9, 0xa8} +constexpr GUID JITEEVersionIdentifier = { /* c766b7f8-7671-4eef-9d5d-c2242e850c26 */ + 0xc766b7f8, + 0x7671, + 0x4eef, + {0x9d, 0x5d, 0xc2, 0x24, 0x2e, 0x85, 0x0c, 0x26} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/ICorJitInfo_API_names.h b/src/coreclr/jit/ICorJitInfo_API_names.h index 1c74f8b96aed..535b455398b1 100644 --- a/src/coreclr/jit/ICorJitInfo_API_names.h +++ b/src/coreclr/jit/ICorJitInfo_API_names.h @@ -121,6 +121,7 @@ DEF_CLR_API(getMethodNameFromMetadata) DEF_CLR_API(getMethodHash) DEF_CLR_API(findNameOfToken) 
DEF_CLR_API(getSystemVAmd64PassStructInRegisterDescriptor) +DEF_CLR_API(getLlvmModule) DEF_CLR_API(getThreadTLSIndex) DEF_CLR_API(getInlinedCallFrameVptr) DEF_CLR_API(getAddrOfCaptureThreadGlobal) diff --git a/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp b/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp index 42d2b55b7303..1ea49f75710f 100644 --- a/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp +++ b/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp @@ -1145,6 +1145,14 @@ bool WrapICorJitInfo::getSystemVAmd64PassStructInRegisterDescriptor( return temp; } +void* WrapICorJitInfo::getLlvmModule() +{ + API_ENTER(getLlvmModule); + void* temp = wrapHnd->getLlvmModule(); + API_LEAVE(getLlvmModule); + return temp; +} + DWORD WrapICorJitInfo::getThreadTLSIndex( void** ppIndirection) { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 3b70be9b9097..4c5ca7bbaf73 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4441,8 +4441,10 @@ void Compiler::EndPhase(Phases phase) } #if defined(TARGET_WASM) -inline void DoLlvmPhase(Compiler* _compiler) +inline void DoLlvmPhase(Compiler* pCompiler) { + void* llvmModule; + llvmModule = pCompiler->info.compCompHnd->getLlvmModule(); fatal(CORJIT_SKIPPED); //assert(false); } diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs b/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs index 582ec65e0f17..e9b8d34277b4 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs @@ -10,7 +10,7 @@ namespace Internal.JitInterface { - public unsafe partial class CorInfoImpl + unsafe partial class CorInfoImpl { [UnmanagedCallersOnly] static uint _getMethodAttribs(IntPtr thisHandle, IntPtr* ppException, CORINFO_METHOD_STRUCT_* ftn) @@ -1741,6 +1741,21 @@ static byte _getSystemVAmd64PassStructInRegisterDescriptor(IntPtr thisHandle, In } } + [UnmanagedCallersOnly] + static void* _getLlvmModule(IntPtr thisHandle, IntPtr* ppException) + { + var _this = GetThis(thisHandle); + try + { + return _this.getLlvmModule(); + } + catch (Exception ex) + { + *ppException = _this.AllocException(ex); + return default; + } + } + [UnmanagedCallersOnly] static uint _getThreadTLSIndex(IntPtr thisHandle, IntPtr* ppException, void** ppIndirection) { @@ -2536,7 +2551,7 @@ static uint _getJitFlags(IntPtr thisHandle, IntPtr* ppException, CORJIT_FLAGS* f static IntPtr GetUnmanagedCallbacks() { - void** callbacks = (void**)Marshal.AllocCoTaskMem(sizeof(IntPtr) * 171); + void** callbacks = (void**)Marshal.AllocCoTaskMem(sizeof(IntPtr) * 172); callbacks[0] = (delegate* unmanaged)&_getMethodAttribs; callbacks[1] = (delegate* unmanaged)&_setMethodAttribs; @@ -2655,60 +2670,61 @@ static IntPtr GetUnmanagedCallbacks() callbacks[114] = (delegate* unmanaged)&_getMethodHash; callbacks[115] = (delegate* unmanaged)&_findNameOfToken; callbacks[116] = (delegate* unmanaged)&_getSystemVAmd64PassStructInRegisterDescriptor; - callbacks[117] = (delegate* unmanaged)&_getThreadTLSIndex; - callbacks[118] = (delegate* unmanaged)&_getInlinedCallFrameVptr; - callbacks[119] = (delegate* unmanaged)&_getAddrOfCaptureThreadGlobal; - callbacks[120] = (delegate* unmanaged)&_getHelperFtn; - callbacks[121] = (delegate* unmanaged)&_getFunctionEntryPoint; - callbacks[122] = (delegate* unmanaged)&_getFunctionFixedEntryPoint; - callbacks[123] = (delegate* unmanaged)&_getMethodSync; - callbacks[124] = (delegate* unmanaged)&_getLazyStringLiteralHelper; - callbacks[125] = (delegate* unmanaged)&_embedModuleHandle; - 
callbacks[126] = (delegate* unmanaged)&_embedClassHandle; - callbacks[127] = (delegate* unmanaged)&_embedMethodHandle; - callbacks[128] = (delegate* unmanaged)&_embedFieldHandle; - callbacks[129] = (delegate* unmanaged)&_embedGenericHandle; - callbacks[130] = (delegate* unmanaged)&_getLocationOfThisType; - callbacks[131] = (delegate* unmanaged)&_getAddressOfPInvokeTarget; - callbacks[132] = (delegate* unmanaged)&_GetCookieForPInvokeCalliSig; - callbacks[133] = (delegate* unmanaged)&_canGetCookieForPInvokeCalliSig; - callbacks[134] = (delegate* unmanaged)&_getJustMyCodeHandle; - callbacks[135] = (delegate* unmanaged)&_GetProfilingHandle; - callbacks[136] = (delegate* unmanaged)&_getCallInfo; - callbacks[137] = (delegate* unmanaged)&_canAccessFamily; - callbacks[138] = (delegate* unmanaged)&_isRIDClassDomainID; - callbacks[139] = (delegate* unmanaged)&_getClassDomainID; - callbacks[140] = (delegate* unmanaged)&_getFieldAddress; - callbacks[141] = (delegate* unmanaged)&_getStaticFieldCurrentClass; - callbacks[142] = (delegate* unmanaged)&_getVarArgsHandle; - callbacks[143] = (delegate* unmanaged)&_canGetVarArgsHandle; - callbacks[144] = (delegate* unmanaged)&_constructStringLiteral; - callbacks[145] = (delegate* unmanaged)&_emptyStringLiteral; - callbacks[146] = (delegate* unmanaged)&_getFieldThreadLocalStoreID; - callbacks[147] = (delegate* unmanaged)&_setOverride; - callbacks[148] = (delegate* unmanaged)&_addActiveDependency; - callbacks[149] = (delegate* unmanaged)&_GetDelegateCtor; - callbacks[150] = (delegate* unmanaged)&_MethodCompileComplete; - callbacks[151] = (delegate* unmanaged)&_getTailCallHelpers; - callbacks[152] = (delegate* unmanaged)&_convertPInvokeCalliToCall; - callbacks[153] = (delegate* unmanaged)&_notifyInstructionSetUsage; - callbacks[154] = (delegate* unmanaged)&_allocMem; - callbacks[155] = (delegate* unmanaged)&_reserveUnwindInfo; - callbacks[156] = (delegate* unmanaged)&_allocUnwindInfo; - callbacks[157] = (delegate* unmanaged)&_allocGCInfo; - callbacks[158] = (delegate* unmanaged)&_setEHcount; - callbacks[159] = (delegate* unmanaged)&_setEHinfo; - callbacks[160] = (delegate* unmanaged)&_logMsg; - callbacks[161] = (delegate* unmanaged)&_doAssert; - callbacks[162] = (delegate* unmanaged)&_reportFatalError; - callbacks[163] = (delegate* unmanaged)&_getPgoInstrumentationResults; - callbacks[164] = (delegate* unmanaged)&_allocPgoInstrumentationBySchema; - callbacks[165] = (delegate* unmanaged)&_getLikelyClass; - callbacks[166] = (delegate* unmanaged)&_recordCallSite; - callbacks[167] = (delegate* unmanaged)&_recordRelocation; - callbacks[168] = (delegate* unmanaged)&_getRelocTypeHint; - callbacks[169] = (delegate* unmanaged)&_getExpectedTargetArchitecture; - callbacks[170] = (delegate* unmanaged)&_getJitFlags; + callbacks[117] = (delegate* unmanaged)&_getLlvmModule; + callbacks[118] = (delegate* unmanaged)&_getThreadTLSIndex; + callbacks[119] = (delegate* unmanaged)&_getInlinedCallFrameVptr; + callbacks[120] = (delegate* unmanaged)&_getAddrOfCaptureThreadGlobal; + callbacks[121] = (delegate* unmanaged)&_getHelperFtn; + callbacks[122] = (delegate* unmanaged)&_getFunctionEntryPoint; + callbacks[123] = (delegate* unmanaged)&_getFunctionFixedEntryPoint; + callbacks[124] = (delegate* unmanaged)&_getMethodSync; + callbacks[125] = (delegate* unmanaged)&_getLazyStringLiteralHelper; + callbacks[126] = (delegate* unmanaged)&_embedModuleHandle; + callbacks[127] = (delegate* unmanaged)&_embedClassHandle; + callbacks[128] = (delegate* unmanaged)&_embedMethodHandle; + callbacks[129] 
= (delegate* unmanaged)&_embedFieldHandle; + callbacks[130] = (delegate* unmanaged)&_embedGenericHandle; + callbacks[131] = (delegate* unmanaged)&_getLocationOfThisType; + callbacks[132] = (delegate* unmanaged)&_getAddressOfPInvokeTarget; + callbacks[133] = (delegate* unmanaged)&_GetCookieForPInvokeCalliSig; + callbacks[134] = (delegate* unmanaged)&_canGetCookieForPInvokeCalliSig; + callbacks[135] = (delegate* unmanaged)&_getJustMyCodeHandle; + callbacks[136] = (delegate* unmanaged)&_GetProfilingHandle; + callbacks[137] = (delegate* unmanaged)&_getCallInfo; + callbacks[138] = (delegate* unmanaged)&_canAccessFamily; + callbacks[139] = (delegate* unmanaged)&_isRIDClassDomainID; + callbacks[140] = (delegate* unmanaged)&_getClassDomainID; + callbacks[141] = (delegate* unmanaged)&_getFieldAddress; + callbacks[142] = (delegate* unmanaged)&_getStaticFieldCurrentClass; + callbacks[143] = (delegate* unmanaged)&_getVarArgsHandle; + callbacks[144] = (delegate* unmanaged)&_canGetVarArgsHandle; + callbacks[145] = (delegate* unmanaged)&_constructStringLiteral; + callbacks[146] = (delegate* unmanaged)&_emptyStringLiteral; + callbacks[147] = (delegate* unmanaged)&_getFieldThreadLocalStoreID; + callbacks[148] = (delegate* unmanaged)&_setOverride; + callbacks[149] = (delegate* unmanaged)&_addActiveDependency; + callbacks[150] = (delegate* unmanaged)&_GetDelegateCtor; + callbacks[151] = (delegate* unmanaged)&_MethodCompileComplete; + callbacks[152] = (delegate* unmanaged)&_getTailCallHelpers; + callbacks[153] = (delegate* unmanaged)&_convertPInvokeCalliToCall; + callbacks[154] = (delegate* unmanaged)&_notifyInstructionSetUsage; + callbacks[155] = (delegate* unmanaged)&_allocMem; + callbacks[156] = (delegate* unmanaged)&_reserveUnwindInfo; + callbacks[157] = (delegate* unmanaged)&_allocUnwindInfo; + callbacks[158] = (delegate* unmanaged)&_allocGCInfo; + callbacks[159] = (delegate* unmanaged)&_setEHcount; + callbacks[160] = (delegate* unmanaged)&_setEHinfo; + callbacks[161] = (delegate* unmanaged)&_logMsg; + callbacks[162] = (delegate* unmanaged)&_doAssert; + callbacks[163] = (delegate* unmanaged)&_reportFatalError; + callbacks[164] = (delegate* unmanaged)&_getPgoInstrumentationResults; + callbacks[165] = (delegate* unmanaged)&_allocPgoInstrumentationBySchema; + callbacks[166] = (delegate* unmanaged)&_getLikelyClass; + callbacks[167] = (delegate* unmanaged)&_recordCallSite; + callbacks[168] = (delegate* unmanaged)&_recordRelocation; + callbacks[169] = (delegate* unmanaged)&_getRelocTypeHint; + callbacks[170] = (delegate* unmanaged)&_getExpectedTargetArchitecture; + callbacks[171] = (delegate* unmanaged)&_getJitFlags; return (IntPtr)callbacks; } diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index 48b42c899094..c92f427ca2bf 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -3455,6 +3455,11 @@ private uint getJitFlags(ref CORJIT_FLAGS flags, uint sizeInBytes) return (uint)sizeof(CORJIT_FLAGS); } + private void* getLlvmModule() + { + return _llvmModuleHandle.ToPointer(); + } + #if READYTORUN InstructionSetFlags _actualInstructionSetSupported; diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt index 1b6b4e8f259c..3f11b86107c0 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt +++ 
b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt @@ -278,6 +278,7 @@ FUNCTIONS unsigned getMethodHash(CORINFO_METHOD_HANDLE ftn); size_t findNameOfToken(CORINFO_MODULE_HANDLE moduleHandle,mdToken token, char * szFQName,size_t FQNameCapacity); bool getSystemVAmd64PassStructInRegisterDescriptor(CORINFO_CLASS_HANDLE structHnd, SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr); + void* getLlvmModule(); DWORD getThreadTLSIndex(void **ppIndirection); const void * getInlinedCallFrameVptr(void **ppIndirection); LONG * getAddrOfCaptureThreadGlobal(void **ppIndirection); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index 1dfd1edc2883..6e85510885dc 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -92,7 +92,7 @@ protected override void ComputeDependencyNodeDependencies(List methodsToCompile) { - CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); + CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this, Module.Handle)); foreach (LLVMMethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) { @@ -116,11 +116,11 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC try { - // corInfo.CompileMethod(methodCodeNodeNeedingCode); + corInfo.CompileMethod(methodCodeNodeNeedingCode); ryuJitMethodCount++; - // } - // catch (CodeGenerationFailedException) - // { + } + catch (CodeGenerationFailedException) + { ILImporter.CompileMethod(this, methodCodeNodeNeedingCode); } catch (TypeSystemException ex) diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs index 0e68a3a09917..49a7951283b6 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs @@ -46,6 +46,7 @@ private struct SequencePoint private readonly UnboxingMethodDescFactory _unboxingThunkFactory = new UnboxingMethodDescFactory(); private bool _isFallbackBodyCompilation; private DependencyList _additionalDependencies; + private IntPtr _llvmModuleHandle; public CorInfoImpl(RyuJitCompilation compilation) : this() @@ -53,6 +54,13 @@ public CorInfoImpl(RyuJitCompilation compilation) _compilation = compilation; } + public CorInfoImpl(RyuJitCompilation compilation, IntPtr llvmModuleRef) + : this() + { + _compilation = compilation; + _llvmModuleHandle = llvmModuleRef; + } + private MethodDesc getUnboxingThunk(MethodDesc method) { return _unboxingThunkFactory.GetUnboxingMethod(method); diff --git a/src/coreclr/tools/aot/jitinterface/jitinterface.h b/src/coreclr/tools/aot/jitinterface/jitinterface.h index 8d6df6f35572..fac1218130a4 100644 --- a/src/coreclr/tools/aot/jitinterface/jitinterface.h +++ b/src/coreclr/tools/aot/jitinterface/jitinterface.h @@ -127,6 +127,7 @@ struct JitInterfaceCallbacks unsigned (* getMethodHash)(void * thisHandle, CorInfoExceptionClass** ppException, void* ftn); size_t (* findNameOfToken)(void * thisHandle, CorInfoExceptionClass** ppException, void* moduleHandle, unsigned int token, char* szFQName, size_t FQNameCapacity); bool (* getSystemVAmd64PassStructInRegisterDescriptor)(void * thisHandle, CorInfoExceptionClass** 
ppException, void* structHnd, void* structPassInRegDescPtr); + void* (* getLlvmModule)(void * thisHandle, CorInfoExceptionClass** ppException); unsigned int (* getThreadTLSIndex)(void * thisHandle, CorInfoExceptionClass** ppException, void** ppIndirection); const void* (* getInlinedCallFrameVptr)(void * thisHandle, CorInfoExceptionClass** ppException, void** ppIndirection); long* (* getAddrOfCaptureThreadGlobal)(void * thisHandle, CorInfoExceptionClass** ppException, void** ppIndirection); @@ -1312,6 +1313,14 @@ class JitInterfaceWrapper return temp; } + virtual void* getLlvmModule() +{ + CorInfoExceptionClass* pException = nullptr; + void* temp = _callbacks->getLlvmModule(_thisHandle, &pException); + if (pException != nullptr) throw pException; + return temp; +} + virtual unsigned int getThreadTLSIndex( void** ppIndirection) { From bf759ae25675cd8bc4c278255b84ca041df1d08d Mon Sep 17 00:00:00 2001 From: yowl Date: Sat, 27 Feb 2021 15:38:27 -0500 Subject: [PATCH 16/44] Will compile but wont link (no libLLVM.lib) May also need before including any LLVM header files: #undef NumItems --- eng/native/configurecompiler.cmake | 5 +++++ eng/pipelines/runtimelab.yml | 1 - src/coreclr/inc/crosscomp.h | 4 ++++ src/coreclr/inc/daccess.h | 5 +++++ src/coreclr/jit/CMakeLists.txt | 12 ++++++++++++ src/coreclr/jit/compiler.cpp | 18 +++++++++++------- src/coreclr/jit/llvm.cpp | 30 ++++++++++++++++++++++++++++++ src/coreclr/jit/llvm.h | 30 ++++++++++++++++++++++++++++++ src/coreclr/jit/wasm.h | 10 ++++++++++ 9 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 src/coreclr/jit/llvm.cpp create mode 100644 src/coreclr/jit/llvm.h create mode 100644 src/coreclr/jit/wasm.h diff --git a/eng/native/configurecompiler.cmake b/eng/native/configurecompiler.cmake index 30ca6d11a8fc..354c14cf1252 100644 --- a/eng/native/configurecompiler.cmake +++ b/eng/native/configurecompiler.cmake @@ -505,6 +505,11 @@ if (MSVC) add_compile_options($<$:/wd4291>) add_compile_options($<$:/wd5105>) + add_compile_options($<$:/wd4244>) + add_compile_options($<$:/wd4267>) + add_compile_options($<$:/wd4141>) + add_compile_options($<$:/wd4310>) + # Treat Warnings as Errors: # 4007: 'main' : must be __cdecl. # 4013: 'function' undefined - assuming extern returning int. 
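The configurecompiler.cmake hunk above relaxes four MSVC warnings so that the LLVM headers the wasm jit is about to include can build under coreclr's warnings-as-errors settings: C4244 and C4267 are narrowing conversions ("possible loss of data", C4267 being the size_t case), C4141 is a modifier such as 'inline' used more than once, and C4310 is a cast that truncates a constant value. The snippet below only illustrates the two conversion warnings; the function and variable names are invented and it is not code from the patch:

    // Illustration (not from the patch) of the conversions behind C4244/C4267,
    // two of the warnings the hunk above suppresses for the LLVM headers.
    // Built with MSVC on x64, both lines marked below draw the diagnostics,
    // which /WX would otherwise turn into errors.
    #include <cstdio>
    #include <vector>

    static unsigned takeCount(unsigned count) { return count; }

    int main()
    {
        std::vector<int> items{1, 2, 3};

        // C4267: conversion from 'size_t' to 'unsigned int', possible loss of data
        unsigned count = takeCount(items.size());

        // C4244: conversion from 'double' to 'int', possible loss of data
        double scale = 2.5;
        int truncated = scale;

        std::printf("%u %d\n", count, truncated);
        return 0;
    }
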
diff --git a/eng/pipelines/runtimelab.yml b/eng/pipelines/runtimelab.yml index 26ff64a54677..4dd3e2d9ee2a 100644 --- a/eng/pipelines/runtimelab.yml +++ b/eng/pipelines/runtimelab.yml @@ -79,7 +79,6 @@ stages: runtimeFlavor: coreclr platforms: - Browser_wasm - - windows_x64 jobParameters: timeoutInMinutes: 90 testGroup: innerloop diff --git a/src/coreclr/inc/crosscomp.h b/src/coreclr/inc/crosscomp.h index d5d8cc8bd70b..fd2c04c2bc06 100644 --- a/src/coreclr/inc/crosscomp.h +++ b/src/coreclr/inc/crosscomp.h @@ -26,6 +26,10 @@ #endif #endif // TARGET_WINDOWS +#ifdef TARGET_WASM +#include "wasm.h" +#endif + #ifdef UNICODE #define MAKE_TARGET_DLLNAME(name) MAKE_TARGET_DLLNAME_W(name) #else diff --git a/src/coreclr/inc/daccess.h b/src/coreclr/inc/daccess.h index b0269592c9ef..d7b410113a6b 100644 --- a/src/coreclr/inc/daccess.h +++ b/src/coreclr/inc/daccess.h @@ -2371,6 +2371,11 @@ typedef S8PTR(const char) PTR_CUTF8; typedef S16PTR(WCHAR) PTR_WSTR; typedef S16PTR(const WCHAR) PTR_CWSTR; +#if TARGET_WASM +#define T_CONTEXT CONTEXT +#define PT_RUNTIME_FUNCTION PRUNTIME_FUNCTION +#endif + typedef DPTR(T_CONTEXT) PTR_CONTEXT; typedef DPTR(PTR_CONTEXT) PTR_PTR_CONTEXT; typedef DPTR(struct _EXCEPTION_POINTERS) PTR_EXCEPTION_POINTERS; diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 89278bccc441..1f98303b0656 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -3,6 +3,10 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) include_directories("./jitstd") include_directories("../inc") +find_package(LLVM REQUIRED CONFIG PATHS E:/GitHub/llvm-project/build/lib/cmake/llvm) +include_directories(${LLVM_INCLUDE_DIRS}) +add_definitions(${LLVM_DEFINITIONS}) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") add_compile_options($<$:-fpermissive>) add_compile_options(-Wno-error) @@ -69,10 +73,14 @@ function(create_standalone_jit) if (TARGETDETAILS_ARCH STREQUAL "wasm64") target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM64) + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) endif () if (TARGETDETAILS_ARCH STREQUAL "wasm32") target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM32) + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) endif () endfunction() @@ -232,6 +240,7 @@ if (CLR_CMAKE_TARGET_WIN32) jitstd.h jittelemetry.h lir.h + llvm.h loopcloning.h loopcloningopts.h lower.h @@ -272,6 +281,7 @@ if (CLR_CMAKE_TARGET_WIN32) valuenumtype.h varset.h vartype.h + wasm.h ) if (CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_ARM) @@ -323,6 +333,7 @@ set( JIT_WASM64_SOURCES unwindwasm.cpp hwintrinsicxarch.cpp hwintrinsiccodegenxarch.cpp + llvm.cpp ) set( JIT_WASM32_SOURCES simd.cpp @@ -332,6 +343,7 @@ set( JIT_WASM32_SOURCES unwindwasm.cpp hwintrinsicxarch.cpp hwintrinsiccodegenxarch.cpp + llvm.cpp ) set( JIT_ARM_SOURCES diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 4c5ca7bbaf73..58530d69daea 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -23,6 +23,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jittelemetry.h" #include "patchpointinfo.h" #include "jitstd/algorithm.h" +#if 
defined(TARGET_WASM) +#include "llvm.h" +#endif #if defined(DEBUG) // Column settings for COMPlus_JitDumpIR. We could(should) make these programmable. @@ -4443,8 +4446,9 @@ void Compiler::EndPhase(Phases phase) #if defined(TARGET_WASM) inline void DoLlvmPhase(Compiler* pCompiler) { - void* llvmModule; - llvmModule = pCompiler->info.compCompHnd->getLlvmModule(); + Llvm* llvm = new Llvm(); + llvm->Compile(pCompiler); + delete llvm; fatal(CORJIT_SKIPPED); //assert(false); } @@ -6632,7 +6636,7 @@ void Compiler::compInitVarScopeMap() compVarScopeMap = new (getAllocator()) VarNumToScopeDscMap(getAllocator()); // 599 prime to limit huge allocations; for ex: duplicated scopes on single var. - compVarScopeMap->Reallocate(min(info.compVarScopesCount, 599)); + compVarScopeMap->Reallocate(std::min(info.compVarScopesCount, 599U)); for (unsigned i = 0; i < info.compVarScopesCount; ++i) { @@ -7757,9 +7761,9 @@ void CompTimeSummaryInfo::AddInfo(CompTimeInfo& info, bool includePhases) // Update the totals and maxima. m_total.m_byteCodeBytes += info.m_byteCodeBytes; - m_maximum.m_byteCodeBytes = max(m_maximum.m_byteCodeBytes, info.m_byteCodeBytes); + m_maximum.m_byteCodeBytes = std::max(m_maximum.m_byteCodeBytes, info.m_byteCodeBytes); m_total.m_totalCycles += info.m_totalCycles; - m_maximum.m_totalCycles = max(m_maximum.m_totalCycles, info.m_totalCycles); + m_maximum.m_totalCycles = std::max(m_maximum.m_totalCycles, info.m_totalCycles); #if MEASURE_CLRAPI_CALLS // Update the CLR-API values. @@ -7796,14 +7800,14 @@ void CompTimeSummaryInfo::AddInfo(CompTimeInfo& info, bool includePhases) m_filtered.m_CLRcyclesByPhase[i] += info.m_CLRcyclesByPhase[i]; #endif } - m_maximum.m_cyclesByPhase[i] = max(m_maximum.m_cyclesByPhase[i], info.m_cyclesByPhase[i]); + m_maximum.m_cyclesByPhase[i] = std::max(m_maximum.m_cyclesByPhase[i], info.m_cyclesByPhase[i]); #if MEASURE_CLRAPI_CALLS m_maximum.m_CLRcyclesByPhase[i] = max(m_maximum.m_CLRcyclesByPhase[i], info.m_CLRcyclesByPhase[i]); #endif } m_total.m_parentPhaseEndSlop += info.m_parentPhaseEndSlop; - m_maximum.m_parentPhaseEndSlop = max(m_maximum.m_parentPhaseEndSlop, info.m_parentPhaseEndSlop); + m_maximum.m_parentPhaseEndSlop = std::max(m_maximum.m_parentPhaseEndSlop, info.m_parentPhaseEndSlop); } #if MEASURE_CLRAPI_CALLS else diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp new file mode 100644 index 000000000000..ac676f1c6714 --- /dev/null +++ b/src/coreclr/jit/llvm.cpp @@ -0,0 +1,30 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
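Two of the compiler.cpp edits above are knock-on effects of pulling LLVM headers into the jit build: the function-style `min`/`max` macros from the Windows/PAL headers clash with `std::min`/`std::max` (the new llvm.h below #undefs them for the same reason), so the calls are rewritten to the std:: forms, and the literal in `std::min(info.compVarScopesCount, 599U)` gains a `U` suffix because `std::min` deduces a single template type and rejects an `unsigned` mixed with an `int`. A standalone sketch of both points, with made-up variable names:

    // Sketch (not jit code) of why the hunk above uses std::min and a 599U literal:
    // - a function-style `min` macro, as the Windows/PAL headers define, textually
    //   rewrites anything spelled min(...), so the template is only usable with the
    //   macro out of the way (llvm.h later #undefs min/max for this reason);
    // - std::min deduces one type T for both arguments, so unsigned vs int fails.
    #include <algorithm>
    #include <cstdio>

    int main()
    {
        unsigned varScopesCount = 1024;  // stand-in for info.compVarScopesCount

        // unsigned capped = std::min(varScopesCount, 599);  // error: deduces unsigned vs int
        unsigned capped = std::min(varScopesCount, 599U);     // ok: both arguments unsigned

        std::printf("%u\n", capped);
        return 0;
    }
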
+ +#ifdef TARGET_WASM +#include "llvm.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "compiler.h" + +using llvm::Function; +using llvm::FunctionType; +using llvm::Type; +using llvm::LLVMContext; +using llvm::ArrayRef; +using llvm::Module; + +//------------------------------------------------------------------------ +// Compile: Compile IR to LLVM, adding to the LLVM Module +// +void Llvm::Compile(Compiler* pCompiler) +{ + Module* llvmModule = (Module *)pCompiler->info.compCompHnd->getLlvmModule(); + LLVMContext& llvmContext = llvmModule->getContext(); + + const char* mangledName = pCompiler->info.compCompHnd->getMethodName(pCompiler->info.compMethodHnd, NULL); + std::vector argTypes; + Function* function = Function::Create(FunctionType::get(Type::getInt32Ty(llvmContext), ArrayRef(argTypes), false), Function::InternalLinkage, 0U, mangledName, llvmModule); +} + +#endif diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h new file mode 100644 index 000000000000..982f5ebbaf4e --- /dev/null +++ b/src/coreclr/jit/llvm.h @@ -0,0 +1,30 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +/*****************************************************************************/ +#ifndef _LLVM_H_ +#define _LLVM_H_ +#undef __PLACEMENT_NEW_INLINE + +#include "alloc.h" +#include "jitpch.h" +#include + +// these break std::min/max +#undef min +#undef max +#include "llvm/ADT/APFloat.h" +#ifdef TARGET_WASM + + +//llvm::detail::DoubleAPFloat(const llvm::detail::DoubleAPFloat &) = default; + +class Llvm +{ +public: + void Compile(Compiler* pCompiler); +}; + +#endif + +#endif /* End of _LLVM_H_ */ diff --git a/src/coreclr/jit/wasm.h b/src/coreclr/jit/wasm.h new file mode 100644 index 000000000000..dfee7df9877d --- /dev/null +++ b/src/coreclr/jit/wasm.h @@ -0,0 +1,10 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
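For reference alongside the new llvm.cpp and llvm.h above, here is a standalone sketch of the same LLVM C++ APIs (`LLVMContext`, `Module`, `FunctionType`, `Function::Create`), extended with the `IRBuilder`/basic-block steps that a defined function needs and that a later commit in this series adds to `Llvm::Compile`. It assumes LLVM development headers and libraries are available (which is what the `find_package(LLVM ...)` addition in this patch's CMakeLists.txt change wires up); the module and function names are placeholders, and this is an illustration rather than the jit's code:

    // Illustrative sketch (not the jit's code) of the LLVM C++ APIs used by
    // llvm.cpp above: build a module, add a void() function with an entry
    // block that just returns, verify it, and print the textual IR.
    // Assumes LLVM headers/libs are available, e.g.:
    //   clang++ sketch.cpp $(llvm-config --cxxflags --ldflags --libs core) -o sketch
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main()
    {
        LLVMContext context;
        Module module("clrjit-sketch", context);

        // void @ExampleMethod() -- the mangled method name is a placeholder here.
        FunctionType* fnType = FunctionType::get(Type::getVoidTy(context), /*isVarArg*/ false);
        Function* fn = Function::Create(fnType, Function::ExternalLinkage, "ExampleMethod", &module);

        // A defined function needs at least one basic block ending in a terminator.
        IRBuilder<> builder(context);
        BasicBlock* entry = BasicBlock::Create(context, "entry", fn);
        builder.SetInsertPoint(entry);
        builder.CreateRetVoid();

        // verifyModule returns true if the module is broken.
        if (verifyModule(module, &errs()))
            return 1;

        module.print(outs(), nullptr);
        return 0;
    }
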
+ +#ifndef WASM_H_ +#define WASM_H_ + +#ifdef TARGET_WASM +#endif + +#endif // WASM_H_ From 991cbea7e4e4edc9358a2735b451793cc42b9033 Mon Sep 17 00:00:00 2001 From: yowl Date: Sat, 27 Feb 2021 18:59:18 -0500 Subject: [PATCH 17/44] change llvm config path to 11 (hard coded for now) --- src/coreclr/jit/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 1f98303b0656..23c7f50c5e7c 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -3,7 +3,7 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) include_directories("./jitstd") include_directories("../inc") -find_package(LLVM REQUIRED CONFIG PATHS E:/GitHub/llvm-project/build/lib/cmake/llvm) +find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) From 6c5d807fd0f1c346f1b17c07c6ea11d9a3b2ed73 Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 28 Feb 2021 11:40:46 -0500 Subject: [PATCH 18/44] Tidy CMakeLists.txt for jit and add LLVM libraries Fix up some merge problems --- docs/workflow/building/coreclr/nativeaot.md | 11 +++++++ src/coreclr/jit/CMakeLists.txt | 32 +++++++++---------- src/coreclr/jit/jit.h | 2 +- .../DependencyAnalysis/LLVMMethodCodeNode.cs | 2 ++ .../Compiler/LLVMCodegenCompilation.cs | 3 +- .../Compiler/LLVMCodegenCompilationBuilder.cs | 2 +- .../DependencyAnalysis/MethodCodeNode.cs | 1 + 7 files changed, 34 insertions(+), 19 deletions(-) diff --git a/docs/workflow/building/coreclr/nativeaot.md b/docs/workflow/building/coreclr/nativeaot.md index 240e1a1e4715..c4315b917f94 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -14,6 +14,11 @@ The Native AOT toolchain can be currently built for Linux, macOS and Windows x64 - This branch contains a version of the WebAssembly compiler that creates LLVM from the clrjit to take advantage of RyuJits optimisations. It goes from RyuJIT IR -> LLVM instead of the NativeAOT-LLVM branch way of CIL -> LLVM. - It does not work, yet or maybe never. - Currently only tested on Windows +- Download the LLVM 11.0.0 source from https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/llvm-11.0.0.src.tar.xz +- Extract and create a subdirectory in the llvm-11.0.0.src folder called build. cd to this build folder +- Configure the LLVM source to use the same runtime as clrjit `cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -D LLVM_USE_CRT_DEBUG=MTd ..` +- Build LLVM either from the command line (`build`) or from VS 2019. You only really need to build the LLVMCore project which is just 12 projects compared to the 400 odd projects when building all. This will save some time. +- Edit `src/coreclr/jit/CMakeLists.txt` and change `find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm)` to where you have built LLVM - Build the x64 libraries and compiler as per the Building section. - Run `build nativeaot+libs+nativeaot.packages -rc [Debug|Release] -lc [Debug|Release] -a wasm -os Browser -runtimeFlavor CoreCLR` - The compiler can now be debugged with the Wasm clrjit. Load the clrjit_browser_wasm32_x64.vcxproj which can be found in artifacts\obj\coreclr\windows.x64.Debug\jit @@ -21,6 +26,12 @@ The Native AOT toolchain can be currently built for Linux, macOS and Windows x64 - Add the package directory to your `nuget.config` as above. 
- Run `dotnet publish -r browser-wasm -c [Debug|Release] /p:Platform=wasm` to publish. +- To work on the clr jit for LLVM: +- Open the Ilc solution and add the clr jit project `clrjit_browser_wasm32_x64.vcxproj` from `artifacts\obj\coreclr\windows.x64.Debug\jit` +- In the project properties General section, change the output folder to the full path for `artifacts\bin\coreclr\windows.x64.Debug\ilc\net5.0` e.g. `E:\GitHub\runtimelab\artifacts\bin\coreclr\windows.x64.Debug\ilc\net5.0` +- Build `clrjit_browser_wasm32_x64` project and you should now be able to change and but breakpoints in the c++ code. + + ## Visual Studio Solutions The repository has a number of Visual Studio Solutions files (`*.sln`) that are useful for editing parts of the repository. Build the repo from command line first before building using the solution files. Remember to select the appropriate configuration that you built. By default, `build.cmd` builds Debug x64 and so `Debug` and `x64` must be selected in the solution build configuration drop downs. diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index ea3673fe5ec7..69f599956fe9 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -3,9 +3,6 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) include_directories("./jitstd") include_directories("../inc") -find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm) -include_directories(${LLVM_INCLUDE_DIRS}) -add_definitions(${LLVM_DEFINITIONS}) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") add_compile_options($<$:-fpermissive>) @@ -70,23 +67,26 @@ function(create_standalone_jit) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_HW_INTRINSICS) endif () - if (TARGETDETAILS_ARCH STREQUAL "wasm64") - target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM) - target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM64) - target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) - target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) - endif () - if (TARGETDETAILS_ARCH STREQUAL "wasm32") + if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") + set(CLR_CMAKE_TARGET_ARCH_WASM 1) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM) - target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM32) + if (TARGETDETAILS_ARCH STREQUAL "wasm32") + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM32) + elseif (TARGETDETAILS_ARCH STREQUAL "wasm64") + target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE TARGET_WASM64) + endif() target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) + + # TODO LLVM build location? 
+ find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm) + include_directories(${LLVM_INCLUDE_DIRS}) + add_definitions(${LLVM_DEFINITIONS}) + llvm_map_components_to_libnames(llvm_libs core) + target_link_libraries(${TARGETDETAILS_TARGET} ${llvm_libs}) endif () -endfunction() -if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") - set(CLR_CMAKE_TARGET_ARCH_WASM 1) -endif () +endfunction() if (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR (CLR_CMAKE_TARGET_ARCH_I386 AND NOT CLR_CMAKE_HOST_UNIX)) add_compile_definitions($<$>>:FEATURE_SIMD>) @@ -566,4 +566,4 @@ if (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_PGO_INSTRUMENT) find_path(PGORT_DIR ${PGORT_DLL} REQUIRED) _install(FILES "${PGORT_DIR}/${PGORT_DLL}" DESTINATION ${CMAKE_INSTALL_PREFIX}) _install(FILES "${PGORT_DIR}/${PGORT_DLL}" DESTINATION ${CMAKE_INSTALL_PREFIX}/sharedFramework) -endif () \ No newline at end of file +endif () diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index c16dec796862..f6323d816d63 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -147,7 +147,7 @@ #if !defined(HOST_ARM64) #define _CROSS_COMPILER_ #endif -#elif defined(TARGET_WASM32) +#elif defined(TARGET_WASM) #else #error Unsupported or unset target architecture #endif diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs index d977ffe6ef52..db622e4c3564 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs @@ -90,6 +90,8 @@ public void InitializeDebugVarInfos(DebugVarInfo[] debugVarInfos) public void InitializeNonRelocationDependencies(DependencyList additionalDependencies) { } + + public void InitializeIsStateMachineMoveNextMethod(bool debugInfoIsStateMachineMoveNextMethod) => throw new System.NotImplementedException(); } internal class LlvmMethodBodyNode : LLVMMethodCodeNode, IMethodBodyNode diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index d7cf12e18a45..62f9ae02b441 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -34,10 +34,11 @@ internal LLVMCodegenCompilation(DependencyAnalyzerBase dependencyGr ILProvider ilProvider, DebugInformationProvider debugInformationProvider, Logger logger, + LLVMCodegenConfigProvider options, IInliningPolicy inliningPolicy, DevirtualizationManager devirtualizationManager, InstructionSetSupport instructionSetSupport) - : base(dependencyGraph, nodeFactory, GetCompilationRoots(roots, nodeFactory), ilProvider, debugInformationProvider, logger, devirtualizationManager, instructionSetSupport, 0) + : base(dependencyGraph, nodeFactory, GetCompilationRoots(roots, nodeFactory), ilProvider, debugInformationProvider, logger, devirtualizationManager, inliningPolicy, instructionSetSupport, 0) { NodeFactory = nodeFactory; LLVMModuleRef m = LLVMModuleRef.CreateWithName(options.ModuleName); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs index 430cdf9a3afb..2102d1da524b 100644 --- 
a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs @@ -68,7 +68,7 @@ public override ICompilation ToCompilation() LLVMCodegenNodeFactory factory = new LLVMCodegenNodeFactory(_context, _compilationGroup, _metadataManager, _interopStubManager, _nameMangler, _vtableSliceProvider, _dictionaryLayoutProvider, GetPreinitializationManager()); JitConfigProvider.Initialize(_context.Target, jitFlagBuilder.ToArray(), _ryujitOptions); DependencyAnalyzerBase graph = CreateDependencyGraph(factory, new ObjectNode.ObjectNodeComparer(new CompilerComparer())); - return new LLVMCodegenCompilation(graph, factory, _compilationRoots, _ilProvider, _debugInformationProvider, _logger, _config, _devirtualizationManager, _instructionSetSupport); + return new LLVMCodegenCompilation(graph, factory, _compilationRoots, _ilProvider, _debugInformationProvider, _logger, _config, _inliningPolicy, _devirtualizationManager, _instructionSetSupport); } } diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs index 1bc77304994f..bc4394bded5d 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/Compiler/DependencyAnalysis/MethodCodeNode.cs @@ -19,6 +19,7 @@ public interface IMethodCodeNode : IMethodNode, ISymbolDefinitionNode void InitializeDebugLocInfos(DebugLocInfo[] debugLocInfos); void InitializeDebugVarInfos(DebugVarInfo[] debugVarInfos); void InitializeNonRelocationDependencies(DependencyNodeCore.DependencyList additionalDependencies); + void InitializeIsStateMachineMoveNextMethod(bool debugInfoIsStateMachineMoveNextMethod); } [DebuggerTypeProxy(typeof(MethodCodeNodeDebugView))] From 5035bd7f56af27a623ac988cb879b85248376703 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 5 Mar 2021 22:10:05 -0500 Subject: [PATCH 19/44] revert jit interface add callback for mangled name --- docs/workflow/building/coreclr/nativeaot.md | 2 +- eng/native/configurecompiler.cmake | 3 + .../superpmi-shim-counter/icorjitinfo.cpp | 6 - .../superpmi-shim-simple/icorjitinfo.cpp | 5 - src/coreclr/inc/corinfo.h | 4 - src/coreclr/inc/corjit.h | 3 + src/coreclr/inc/icorjitinfoimpl_generated.h | 2 - src/coreclr/inc/jiteeversionguid.h | 10 +- src/coreclr/jit/CMakeLists.txt | 2 +- src/coreclr/jit/ICorJitInfo_API_names.h | 1 - src/coreclr/jit/ICorJitInfo_API_wrapper.hpp | 8 -- src/coreclr/jit/compiler.cpp | 16 ++- src/coreclr/jit/ee_il_dll.cpp | 3 + src/coreclr/jit/llvm.cpp | 55 +++++++- src/coreclr/jit/llvm.h | 5 +- .../tools/Common/JitInterface/CorInfoBase.cs | 124 ++++++++---------- .../tools/Common/JitInterface/CorInfoImpl.cs | 14 +- .../ThunkGenerator/ThunkInput.txt | 1 - .../Compiler/LLVMCodegenCompilation.cs | 12 +- .../ILCompiler.RyuJit.csproj | 4 + .../JitInterface/CorInfoImpl.Llvm.cs | 35 +++++ .../JitInterface/CorInfoImpl.RyuJit.cs | 8 -- .../tools/aot/jitinterface/jitinterface.h | 9 -- 23 files changed, 191 insertions(+), 141 deletions(-) create mode 100644 src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs diff --git a/docs/workflow/building/coreclr/nativeaot.md b/docs/workflow/building/coreclr/nativeaot.md index c4315b917f94..9271f76e92b1 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -17,7 +17,7 @@ The Native AOT toolchain can 
be currently built for Linux, macOS and Windows x64 - Download the LLVM 11.0.0 source from https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/llvm-11.0.0.src.tar.xz - Extract and create a subdirectory in the llvm-11.0.0.src folder called build. cd to this build folder - Configure the LLVM source to use the same runtime as clrjit `cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -D LLVM_USE_CRT_DEBUG=MTd ..` -- Build LLVM either from the command line (`build`) or from VS 2019. You only really need to build the LLVMCore project which is just 12 projects compared to the 400 odd projects when building all. This will save some time. +- Build LLVM either from the command line (`build`) or from VS 2019. You only really need to build the LLVMCore and LLVMBitWriter projects which takes less time than the 400 odd projects when building all. This will save some time. - Edit `src/coreclr/jit/CMakeLists.txt` and change `find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm)` to where you have built LLVM - Build the x64 libraries and compiler as per the Building section. - Run `build nativeaot+libs+nativeaot.packages -rc [Debug|Release] -lc [Debug|Release] -a wasm -os Browser -runtimeFlavor CoreCLR` diff --git a/eng/native/configurecompiler.cmake b/eng/native/configurecompiler.cmake index 6dd7d73a83cd..5245fc90e62c 100644 --- a/eng/native/configurecompiler.cmake +++ b/eng/native/configurecompiler.cmake @@ -530,10 +530,13 @@ if (MSVC) add_compile_options($<$:/wd4291>) add_compile_options($<$:/wd5105>) + # TODO: if for LLVM add_compile_options($<$:/wd4244>) add_compile_options($<$:/wd4267>) add_compile_options($<$:/wd4141>) add_compile_options($<$:/wd4310>) + add_compile_options($<$:/wd4624>) # destructor was implicitly defined as deleted + add_compile_options($<$:/wd4324>) # structure was padded due to alignment specifier # Treat Warnings as Errors: # 4007: 'main' : must be __cdecl. 
diff --git a/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp index f4df78db738f..674d9f4e1fd1 100644 --- a/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp +++ b/src/coreclr/ToolBox/superpmi/superpmi-shim-counter/icorjitinfo.cpp @@ -951,12 +951,6 @@ uint32_t interceptor_ICJI::getThreadTLSIndex( return original_ICorJitInfo->getThreadTLSIndex(ppIndirection); } -void* interceptor_ICJI::getLlvmModule() -{ - mcs->AddCall("getLlvmModule"); - return original_ICorJitInfo->getLlvmModule(); -} - const void* interceptor_ICJI::getInlinedCallFrameVptr( void** ppIndirection) { diff --git a/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp index 1b9ebe4d076c..a276453ba69c 100644 --- a/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp +++ b/src/coreclr/ToolBox/superpmi/superpmi-shim-simple/icorjitinfo.cpp @@ -832,11 +832,6 @@ uint32_t interceptor_ICJI::getThreadTLSIndex( return original_ICorJitInfo->getThreadTLSIndex(ppIndirection); } -void* interceptor_ICJI::getLlvmModule() -{ - return original_ICorJitInfo->getLlvmModule(); -} - const void* interceptor_ICJI::getInlinedCallFrameVptr( void** ppIndirection) { diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h index 45d86b669da0..c7922d3fff57 100644 --- a/src/coreclr/inc/corinfo.h +++ b/src/coreclr/inc/corinfo.h @@ -2844,10 +2844,6 @@ class ICorStaticInfo /* OUT */ SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr ) = 0; - // return the LLVM Module. This is static as there is only one module. - virtual void* getLlvmModule( - ) = 0; - }; /***************************************************************************** diff --git a/src/coreclr/inc/corjit.h b/src/coreclr/inc/corjit.h index 85cb80bc3700..f516aafa3a26 100644 --- a/src/coreclr/inc/corjit.h +++ b/src/coreclr/inc/corjit.h @@ -145,6 +145,9 @@ enum CheckedWriteBarrierKinds { #include "corjithost.h" extern "C" void jitStartup(ICorJitHost* host); +#if TARGET_WASM +extern "C" void jitShutdown(bool processIsTerminating); +#endif class ICorJitCompiler; class ICorJitInfo; diff --git a/src/coreclr/inc/icorjitinfoimpl_generated.h b/src/coreclr/inc/icorjitinfoimpl_generated.h index a225ea65f2b3..beb71ca0d3fa 100644 --- a/src/coreclr/inc/icorjitinfoimpl_generated.h +++ b/src/coreclr/inc/icorjitinfoimpl_generated.h @@ -484,8 +484,6 @@ bool getSystemVAmd64PassStructInRegisterDescriptor( uint32_t getThreadTLSIndex( void** ppIndirection) override; -void* getLlvmModule() override; - const void* getInlinedCallFrameVptr( void** ppIndirection) override; diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index af3532c3b393..8d356ee92323 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID; #define GUID_DEFINED #endif // !GUID_DEFINED -constexpr GUID JITEEVersionIdentifier = { /* c766b7f8-7671-4eef-9d5d-c2242e850c26 */ - 0xc766b7f8, - 0x7671, - 0x4eef, - {0x9d, 0x5d, 0xc2, 0x24, 0x2e, 0x85, 0x0c, 0x26} +constexpr GUID JITEEVersionIdentifier = { /* ba99f659-11ae-4c05-bdad-650cb5104f26 */ + 0xba99f659, + 0x11ae, + 0x4c05, + {0xbd, 0xad, 0x65, 0x0c, 0xb5, 0x10, 0x4f, 0x26} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 
69f599956fe9..f3695da617b1 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -82,7 +82,7 @@ function(create_standalone_jit) find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) - llvm_map_components_to_libnames(llvm_libs core) + llvm_map_components_to_libnames(llvm_libs core bitwriter) target_link_libraries(${TARGETDETAILS_TARGET} ${llvm_libs}) endif () diff --git a/src/coreclr/jit/ICorJitInfo_API_names.h b/src/coreclr/jit/ICorJitInfo_API_names.h index 5f46a3098918..2e1e67cc24a7 100644 --- a/src/coreclr/jit/ICorJitInfo_API_names.h +++ b/src/coreclr/jit/ICorJitInfo_API_names.h @@ -123,7 +123,6 @@ DEF_CLR_API(getMethodHash) DEF_CLR_API(findNameOfToken) DEF_CLR_API(getSystemVAmd64PassStructInRegisterDescriptor) DEF_CLR_API(getThreadTLSIndex) -DEF_CLR_API(getLlvmModule) DEF_CLR_API(getInlinedCallFrameVptr) DEF_CLR_API(getAddrOfCaptureThreadGlobal) DEF_CLR_API(getHelperFtn) diff --git a/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp b/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp index 71c2bb536e62..d80bb83ef059 100644 --- a/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp +++ b/src/coreclr/jit/ICorJitInfo_API_wrapper.hpp @@ -1163,14 +1163,6 @@ uint32_t WrapICorJitInfo::getThreadTLSIndex( return temp; } -void* WrapICorJitInfo::getLlvmModule() -{ - API_ENTER(getLlvmModule); - void* temp = wrapHnd->getLlvmModule(); - API_LEAVE(getLlvmModule); - return temp; -} - const void* WrapICorJitInfo::getInlinedCallFrameVptr( void** ppIndirection) { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 9a58104b54ee..ab93db20b1ca 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -25,6 +25,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jitstd/algorithm.h" #if defined(TARGET_WASM) #include "llvm.h" +#else +// TODO: how to get different exports.def for the different clrjits? 
+void registerLlvmCallbacks(void* thisPtr, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) +{ +} #endif #if defined(DEBUG) @@ -1504,7 +1509,9 @@ void Compiler::compStartup() #endif /* Initialize the emitter */ -#ifndef TARGET_WASM +#ifdef TARGET_WASM + Llvm::Init(); +#else emitter::emitInit(); #endif // !TARGET_WASM @@ -1540,7 +1547,9 @@ void Compiler::compShutdown() DisplayNowayAssertMap(); #endif // MEASURE_NOWAY -#ifndef TARGET_WASM +#ifdef TARGET_WASM + Llvm::llvmShutdown(); +#else /* Shut down the emitter */ emitter::emitDone(); @@ -4438,7 +4447,6 @@ inline void DoLlvmPhase(Compiler* pCompiler) Llvm* llvm = new Llvm(); llvm->Compile(pCompiler); delete llvm; - fatal(CORJIT_SKIPPED); //assert(false); } #endif @@ -5768,7 +5776,7 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr, } endErrorTrap() // ERROR TRAP: End - return param.result; + return param.result; } #if defined(DEBUG) || defined(INLINE_DATA) diff --git a/src/coreclr/jit/ee_il_dll.cpp b/src/coreclr/jit/ee_il_dll.cpp index e629b10b7baf..3cd02b1f0068 100644 --- a/src/coreclr/jit/ee_il_dll.cpp +++ b/src/coreclr/jit/ee_il_dll.cpp @@ -123,6 +123,9 @@ extern "C" DLLEXPORT void jitStartup(ICorJitHost* jitHost) g_jitInitialized = true; } +#if TARGET_WASM +extern "C" DLLEXPORT +#endif void jitShutdown(bool processIsTerminating) { if (!g_jitInitialized) diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp index ac676f1c6714..bb68bb3ae40b 100644 --- a/src/coreclr/jit/llvm.cpp +++ b/src/coreclr/jit/llvm.cpp @@ -2,10 +2,14 @@ // The .NET Foundation licenses this file to you under the MIT license. #ifdef TARGET_WASM +#include "compiler.h" #include "llvm.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Function.h" -#include "compiler.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Bitcode/BitcodeWriter.h" using llvm::Function; using llvm::FunctionType; @@ -14,17 +18,54 @@ using llvm::LLVMContext; using llvm::ArrayRef; using llvm::Module; +static Module* _module; +static LLVMContext _llvmContext; +static void* _thisPtr; +static const char* (*_getMangledMethodName)(void*, CORINFO_METHOD_STRUCT_*); + +extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) +{ + _thisPtr = thisPtr; + _getMangledMethodName = getMangledMethodNamePtr; +} + +void Llvm::Init() +{ + _module = new Module(llvm::StringRef("netscripten-clrjit"), _llvmContext); +} + +void Llvm::llvmShutdown() +{ +#if DEBUG + _module->dump(); +#endif //DEBUG + std::error_code ec; + llvm::raw_fd_ostream OS("module.bc", ec); + llvm::WriteBitcodeToFile(*_module, OS); +// Module.Verify(LLVMVerifierFailureAction.LLVMAbortProcessAction); + + //Module.WriteBitcodeToFile(_objectFilePath); +} + //------------------------------------------------------------------------ // Compile: Compile IR to LLVM, adding to the LLVM Module // void Llvm::Compile(Compiler* pCompiler) { - Module* llvmModule = (Module *)pCompiler->info.compCompHnd->getLlvmModule(); - LLVMContext& llvmContext = llvmModule->getContext(); + Compiler::Info info = pCompiler->info; - const char* mangledName = pCompiler->info.compCompHnd->getMethodName(pCompiler->info.compMethodHnd, NULL); - std::vector argTypes; - Function* function = Function::Create(FunctionType::get(Type::getInt32Ty(llvmContext), ArrayRef(argTypes), false), Function::InternalLinkage, 0U, mangledName, llvmModule); -} + //TODO: delete + if (info.compArgsCount != 0 || 
info.compRetType != TYP_VOID) + { + fatal(CORJIT_SKIPPED); + } + // TODO: use of getMethodName is wrong as its only for debug purposes. + const char* mangledName = (*_getMangledMethodName)(_thisPtr, info.compMethodHnd); + Function* function = Function::Create(FunctionType::get(Type::getVoidTy(_llvmContext), ArrayRef(), false), Function::InternalLinkage, 0U, mangledName, _module); + llvm::IRBuilder<> builder(_llvmContext); + llvm::BasicBlock* entry = llvm::BasicBlock::Create(_llvmContext, "", function); + builder.SetInsertPoint(entry); + builder.CreateRetVoid(); +} #endif diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h index f4fb359f67a1..dedaef2146dc 100644 --- a/src/coreclr/jit/llvm.h +++ b/src/coreclr/jit/llvm.h @@ -18,11 +18,14 @@ #ifdef TARGET_WASM -//llvm::detail::DoubleAPFloat(const llvm::detail::DoubleAPFloat &) = default; +extern "C" void registerLlvmCallbacks(void* thisPtr, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)); class Llvm { public: + static void Init(); + static void llvmShutdown(); + void Compile(Compiler* pCompiler); }; diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs b/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs index 1454250b995d..80fc5e6ea34a 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoBase.cs @@ -1771,21 +1771,6 @@ static uint _getThreadTLSIndex(IntPtr thisHandle, IntPtr* ppException, void** pp } } - [UnmanagedCallersOnly] - static void* _getLlvmModule(IntPtr thisHandle, IntPtr* ppException) - { - var _this = GetThis(thisHandle); - try - { - return _this.getLlvmModule(); - } - catch (Exception ex) - { - *ppException = _this.AllocException(ex); - return default; - } - } - [UnmanagedCallersOnly] static void* _getInlinedCallFrameVptr(IntPtr thisHandle, IntPtr* ppException, void** ppIndirection) { @@ -2566,7 +2551,7 @@ static uint _getJitFlags(IntPtr thisHandle, IntPtr* ppException, CORJIT_FLAGS* f static IntPtr GetUnmanagedCallbacks() { - void** callbacks = (void**)Marshal.AllocCoTaskMem(sizeof(IntPtr) * 173); + void** callbacks = (void**)Marshal.AllocCoTaskMem(sizeof(IntPtr) * 172); callbacks[0] = (delegate* unmanaged)&_getMethodAttribs; callbacks[1] = (delegate* unmanaged)&_setMethodAttribs; @@ -2687,60 +2672,59 @@ static IntPtr GetUnmanagedCallbacks() callbacks[116] = (delegate* unmanaged)&_findNameOfToken; callbacks[117] = (delegate* unmanaged)&_getSystemVAmd64PassStructInRegisterDescriptor; callbacks[118] = (delegate* unmanaged)&_getThreadTLSIndex; - callbacks[119] = (delegate* unmanaged)&_getLlvmModule; - callbacks[120] = (delegate* unmanaged)&_getInlinedCallFrameVptr; - callbacks[121] = (delegate* unmanaged)&_getAddrOfCaptureThreadGlobal; - callbacks[122] = (delegate* unmanaged)&_getHelperFtn; - callbacks[123] = (delegate* unmanaged)&_getFunctionEntryPoint; - callbacks[124] = (delegate* unmanaged)&_getFunctionFixedEntryPoint; - callbacks[125] = (delegate* unmanaged)&_getMethodSync; - callbacks[126] = (delegate* unmanaged)&_getLazyStringLiteralHelper; - callbacks[127] = (delegate* unmanaged)&_embedModuleHandle; - callbacks[128] = (delegate* unmanaged)&_embedClassHandle; - callbacks[129] = (delegate* unmanaged)&_embedMethodHandle; - callbacks[130] = (delegate* unmanaged)&_embedFieldHandle; - callbacks[131] = (delegate* unmanaged)&_embedGenericHandle; - callbacks[132] = (delegate* unmanaged)&_getLocationOfThisType; - callbacks[133] = (delegate* unmanaged)&_getAddressOfPInvokeTarget; - callbacks[134] = (delegate* 
unmanaged)&_GetCookieForPInvokeCalliSig; - callbacks[135] = (delegate* unmanaged)&_canGetCookieForPInvokeCalliSig; - callbacks[136] = (delegate* unmanaged)&_getJustMyCodeHandle; - callbacks[137] = (delegate* unmanaged)&_GetProfilingHandle; - callbacks[138] = (delegate* unmanaged)&_getCallInfo; - callbacks[139] = (delegate* unmanaged)&_canAccessFamily; - callbacks[140] = (delegate* unmanaged)&_isRIDClassDomainID; - callbacks[141] = (delegate* unmanaged)&_getClassDomainID; - callbacks[142] = (delegate* unmanaged)&_getFieldAddress; - callbacks[143] = (delegate* unmanaged)&_getStaticFieldCurrentClass; - callbacks[144] = (delegate* unmanaged)&_getVarArgsHandle; - callbacks[145] = (delegate* unmanaged)&_canGetVarArgsHandle; - callbacks[146] = (delegate* unmanaged)&_constructStringLiteral; - callbacks[147] = (delegate* unmanaged)&_emptyStringLiteral; - callbacks[148] = (delegate* unmanaged)&_getFieldThreadLocalStoreID; - callbacks[149] = (delegate* unmanaged)&_setOverride; - callbacks[150] = (delegate* unmanaged)&_addActiveDependency; - callbacks[151] = (delegate* unmanaged)&_GetDelegateCtor; - callbacks[152] = (delegate* unmanaged)&_MethodCompileComplete; - callbacks[153] = (delegate* unmanaged)&_getTailCallHelpers; - callbacks[154] = (delegate* unmanaged)&_convertPInvokeCalliToCall; - callbacks[155] = (delegate* unmanaged)&_notifyInstructionSetUsage; - callbacks[156] = (delegate* unmanaged)&_allocMem; - callbacks[157] = (delegate* unmanaged)&_reserveUnwindInfo; - callbacks[158] = (delegate* unmanaged)&_allocUnwindInfo; - callbacks[159] = (delegate* unmanaged)&_allocGCInfo; - callbacks[160] = (delegate* unmanaged)&_setEHcount; - callbacks[161] = (delegate* unmanaged)&_setEHinfo; - callbacks[162] = (delegate* unmanaged)&_logMsg; - callbacks[163] = (delegate* unmanaged)&_doAssert; - callbacks[164] = (delegate* unmanaged)&_reportFatalError; - callbacks[165] = (delegate* unmanaged)&_getPgoInstrumentationResults; - callbacks[166] = (delegate* unmanaged)&_allocPgoInstrumentationBySchema; - callbacks[167] = (delegate* unmanaged)&_getLikelyClass; - callbacks[168] = (delegate* unmanaged)&_recordCallSite; - callbacks[169] = (delegate* unmanaged)&_recordRelocation; - callbacks[170] = (delegate* unmanaged)&_getRelocTypeHint; - callbacks[171] = (delegate* unmanaged)&_getExpectedTargetArchitecture; - callbacks[172] = (delegate* unmanaged)&_getJitFlags; + callbacks[119] = (delegate* unmanaged)&_getInlinedCallFrameVptr; + callbacks[120] = (delegate* unmanaged)&_getAddrOfCaptureThreadGlobal; + callbacks[121] = (delegate* unmanaged)&_getHelperFtn; + callbacks[122] = (delegate* unmanaged)&_getFunctionEntryPoint; + callbacks[123] = (delegate* unmanaged)&_getFunctionFixedEntryPoint; + callbacks[124] = (delegate* unmanaged)&_getMethodSync; + callbacks[125] = (delegate* unmanaged)&_getLazyStringLiteralHelper; + callbacks[126] = (delegate* unmanaged)&_embedModuleHandle; + callbacks[127] = (delegate* unmanaged)&_embedClassHandle; + callbacks[128] = (delegate* unmanaged)&_embedMethodHandle; + callbacks[129] = (delegate* unmanaged)&_embedFieldHandle; + callbacks[130] = (delegate* unmanaged)&_embedGenericHandle; + callbacks[131] = (delegate* unmanaged)&_getLocationOfThisType; + callbacks[132] = (delegate* unmanaged)&_getAddressOfPInvokeTarget; + callbacks[133] = (delegate* unmanaged)&_GetCookieForPInvokeCalliSig; + callbacks[134] = (delegate* unmanaged)&_canGetCookieForPInvokeCalliSig; + callbacks[135] = (delegate* unmanaged)&_getJustMyCodeHandle; + callbacks[136] = (delegate* unmanaged)&_GetProfilingHandle; + 
callbacks[137] = (delegate* unmanaged)&_getCallInfo; + callbacks[138] = (delegate* unmanaged)&_canAccessFamily; + callbacks[139] = (delegate* unmanaged)&_isRIDClassDomainID; + callbacks[140] = (delegate* unmanaged)&_getClassDomainID; + callbacks[141] = (delegate* unmanaged)&_getFieldAddress; + callbacks[142] = (delegate* unmanaged)&_getStaticFieldCurrentClass; + callbacks[143] = (delegate* unmanaged)&_getVarArgsHandle; + callbacks[144] = (delegate* unmanaged)&_canGetVarArgsHandle; + callbacks[145] = (delegate* unmanaged)&_constructStringLiteral; + callbacks[146] = (delegate* unmanaged)&_emptyStringLiteral; + callbacks[147] = (delegate* unmanaged)&_getFieldThreadLocalStoreID; + callbacks[148] = (delegate* unmanaged)&_setOverride; + callbacks[149] = (delegate* unmanaged)&_addActiveDependency; + callbacks[150] = (delegate* unmanaged)&_GetDelegateCtor; + callbacks[151] = (delegate* unmanaged)&_MethodCompileComplete; + callbacks[152] = (delegate* unmanaged)&_getTailCallHelpers; + callbacks[153] = (delegate* unmanaged)&_convertPInvokeCalliToCall; + callbacks[154] = (delegate* unmanaged)&_notifyInstructionSetUsage; + callbacks[155] = (delegate* unmanaged)&_allocMem; + callbacks[156] = (delegate* unmanaged)&_reserveUnwindInfo; + callbacks[157] = (delegate* unmanaged)&_allocUnwindInfo; + callbacks[158] = (delegate* unmanaged)&_allocGCInfo; + callbacks[159] = (delegate* unmanaged)&_setEHcount; + callbacks[160] = (delegate* unmanaged)&_setEHinfo; + callbacks[161] = (delegate* unmanaged)&_logMsg; + callbacks[162] = (delegate* unmanaged)&_doAssert; + callbacks[163] = (delegate* unmanaged)&_reportFatalError; + callbacks[164] = (delegate* unmanaged)&_getPgoInstrumentationResults; + callbacks[165] = (delegate* unmanaged)&_allocPgoInstrumentationBySchema; + callbacks[166] = (delegate* unmanaged)&_getLikelyClass; + callbacks[167] = (delegate* unmanaged)&_recordCallSite; + callbacks[168] = (delegate* unmanaged)&_recordRelocation; + callbacks[169] = (delegate* unmanaged)&_getRelocTypeHint; + callbacks[170] = (delegate* unmanaged)&_getExpectedTargetArchitecture; + callbacks[171] = (delegate* unmanaged)&_getJitFlags; return (IntPtr)callbacks; } diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index 0f877f076bcb..227d5bd120c6 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -71,6 +71,9 @@ private enum CFI_OPCODE [DllImport(JitLibrary)] private extern static IntPtr jitStartup(IntPtr host); + [DllImport(JitLibrary)] + private extern static void jitShutdown([MarshalAs(UnmanagedType.I1)] bool processIsTerminating); + [DllImport(JitLibrary)] private extern static IntPtr getJit(); @@ -123,6 +126,11 @@ public static void Startup() jitStartup(GetJitHost(JitConfigProvider.Instance.UnmanagedInstance)); } + public static void Shutdown() + { + jitShutdown(true); + } + public CorInfoImpl() { _jit = getJit(); @@ -3471,12 +3479,6 @@ private uint getJitFlags(ref CORJIT_FLAGS flags, uint sizeInBytes) return (uint)sizeof(CORJIT_FLAGS); } - private void* getLlvmModule() - { - return _llvmModuleHandle.ToPointer(); - } - - #if READYTORUN InstructionSetFlags _actualInstructionSetSupported; InstructionSetFlags _actualInstructionSetUnsupported; diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt index 61da6240b7af..d4640311e881 100644 --- 
a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt @@ -269,7 +269,6 @@ FUNCTIONS size_t findNameOfToken(CORINFO_MODULE_HANDLE moduleHandle,mdToken token, char * szFQName,size_t FQNameCapacity); bool getSystemVAmd64PassStructInRegisterDescriptor(CORINFO_CLASS_HANDLE structHnd, SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr); uint32_t getThreadTLSIndex(void **ppIndirection); - void* getLlvmModule(); const void * getInlinedCallFrameVptr(void **ppIndirection); int32_t * getAddrOfCaptureThreadGlobal(void **ppIndirection); void* getHelperFtn (CorInfoHelpFunc ftnNum, void **ppIndirection); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index 62f9ae02b441..b704ebbf46fb 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -63,8 +63,11 @@ protected override void CompileInternal(string outputFile, ObjectDumper dumper) var nodes = _dependencyGraph.MarkedNodeList; - Console.WriteLine($"RyuJIT compilation results, total methods {totalMethodCount} RyuJit Methods {ryuJitMethodCount} % {((decimal)ryuJitMethodCount * 100 / totalMethodCount):n4}"); LLVMObjectWriter.EmitObject(outputFile, nodes, NodeFactory, this, dumper); + + CorInfoImpl.Shutdown(); // writes the LLVM bitcode + + Console.WriteLine($"RyuJIT compilation results, total methods {totalMethodCount} RyuJit Methods {ryuJitMethodCount} % {((decimal)ryuJitMethodCount * 100 / totalMethodCount):n4}"); } protected override void ComputeDependencyNodeDependencies(List> obj) @@ -93,7 +96,12 @@ protected override void ComputeDependencyNodeDependencies(List methodsToCompile) { - CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this, Module.Handle)); + CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => + { + var impl = new CorInfoImpl(this); + impl.RegisterLlvmCallbacks(); + return impl; + }); foreach (LLVMMethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) { diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/ILCompiler.RyuJit.csproj b/src/coreclr/tools/aot/ILCompiler.RyuJit/ILCompiler.RyuJit.csproj index 865fcb71046e..60d8ef9d41cd 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/ILCompiler.RyuJit.csproj +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/ILCompiler.RyuJit.csproj @@ -94,4 +94,8 @@ Pgo\TypeSystemEntityOrUnknown.cs + + + + diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs new file mode 100644 index 000000000000..fd672788da6e --- /dev/null +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs @@ -0,0 +1,35 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using ILCompiler; +using Internal.TypeSystem; + +namespace Internal.JitInterface +{ + public unsafe sealed partial class CorInfoImpl + { + [ThreadStatic] + private static CorInfoImpl _thisStatic; + + [UnmanagedCallersOnly] + public static byte* getMangledMethodName(IntPtr thisHandle, CORINFO_METHOD_STRUCT_* ftn) + { + //var _this = GetThis(thisHandle); // TODO: this doesn't work, but how does it cope anyway with this being moved by the GC? 
+ + MethodDesc method = _thisStatic.HandleToObject(ftn); + + return (byte*)_thisStatic.GetPin(_thisStatic._compilation.NameMangler.GetMangledMethodName(method).UnderlyingArray); + } + + [DllImport(JitLibrary)] + private extern static void registerLlvmCallbacks(IntPtr thisHandle, delegate* unmanaged getMangedMethodNamePtr); + + public void RegisterLlvmCallbacks() + { + CorInfoImpl _this = this; + _thisStatic = this; + + registerLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref _this), (delegate* unmanaged) &getMangledMethodName); + } + } +} diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs index 4211b5cfb928..8c5a2ece1e58 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs @@ -47,7 +47,6 @@ private struct SequencePoint private readonly UnboxingMethodDescFactory _unboxingThunkFactory = new UnboxingMethodDescFactory(); private bool _isFallbackBodyCompilation; private DependencyList _additionalDependencies; - private IntPtr _llvmModuleHandle; public CorInfoImpl(RyuJitCompilation compilation) : this() @@ -55,13 +54,6 @@ public CorInfoImpl(RyuJitCompilation compilation) _compilation = compilation; } - public CorInfoImpl(RyuJitCompilation compilation, IntPtr llvmModuleRef) - : this() - { - _compilation = compilation; - _llvmModuleHandle = llvmModuleRef; - } - private MethodDesc getUnboxingThunk(MethodDesc method) { return _unboxingThunkFactory.GetUnboxingMethod(method); diff --git a/src/coreclr/tools/aot/jitinterface/jitinterface.h b/src/coreclr/tools/aot/jitinterface/jitinterface.h index ec1e2823e800..3e61333832c2 100644 --- a/src/coreclr/tools/aot/jitinterface/jitinterface.h +++ b/src/coreclr/tools/aot/jitinterface/jitinterface.h @@ -130,7 +130,6 @@ struct JitInterfaceCallbacks size_t (* findNameOfToken)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_MODULE_HANDLE moduleHandle, unsigned int token, char* szFQName, size_t FQNameCapacity); bool (* getSystemVAmd64PassStructInRegisterDescriptor)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_CLASS_HANDLE structHnd, SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR* structPassInRegDescPtr); uint32_t (* getThreadTLSIndex)(void * thisHandle, CorInfoExceptionClass** ppException, void** ppIndirection); - void* (* getLlvmModule)(void * thisHandle, CorInfoExceptionClass** ppException); const void* (* getInlinedCallFrameVptr)(void * thisHandle, CorInfoExceptionClass** ppException, void** ppIndirection); int32_t* (* getAddrOfCaptureThreadGlobal)(void * thisHandle, CorInfoExceptionClass** ppException, void** ppIndirection); void* (* getHelperFtn)(void * thisHandle, CorInfoExceptionClass** ppException, CorInfoHelpFunc ftnNum, void** ppIndirection); @@ -1333,14 +1332,6 @@ class JitInterfaceWrapper : public ICorJitInfo return temp; } - virtual void* getLlvmModule() -{ - CorInfoExceptionClass* pException = nullptr; - void* temp = _callbacks->getLlvmModule(_thisHandle, &pException); - if (pException != nullptr) throw pException; - return temp; -} - virtual const void* getInlinedCallFrameVptr( void** ppIndirection) { From c26328bb346002867b497e62c69a4cd63ce1e1a0 Mon Sep 17 00:00:00 2001 From: yowl Date: Sat, 6 Mar 2021 13:44:57 -0500 Subject: [PATCH 20/44] add loops for blocks and statements --- docs/workflow/building/coreclr/nativeaot.md | 1 + src/coreclr/jit/compiler.cpp | 2 + src/coreclr/jit/jit.h | 6 ++- 
src/coreclr/jit/llvm.cpp | 52 ++++++++++++++++--- src/coreclr/jit/llvm.h | 7 ++- .../tools/Common/JitInterface/CorInfoImpl.cs | 6 ++- .../DependencyAnalysis/LLVMMethodCodeNode.cs | 7 ++- .../Compiler/LLVMCodegenCompilation.cs | 14 +++-- .../JitInterface/CorInfoImpl.Llvm.cs | 6 +-- 9 files changed, 81 insertions(+), 20 deletions(-) diff --git a/docs/workflow/building/coreclr/nativeaot.md b/docs/workflow/building/coreclr/nativeaot.md index 9271f76e92b1..455b48fa416f 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -56,6 +56,7 @@ The workflow looks like this: - Open the ilc.sln solution described above. This solution contains the compiler, but also an unrelated project named "repro". This repro project is a small Hello World. You can place any piece of C# you would like to compile in it. Building the project will compile the source code into IL, but also generate a response file that is suitable to pass to the AOT compiler. - Make sure you set the solution configuration in VS to the configuration you just built (e.g. x64 Debug). - In the ILCompiler project properties, on the Debug tab, set the "Application arguments" to the generated response file. This will be a file such as "C:\runtimelab\artifacts\bin\repro\x64\Debug\compile-with-Release-libs.rsp". Prefix the path to the file with "@" to indicate this is a response file so that the "Application arguments" field looks like "@some\path\to\file.rsp". +- For WebAssembly, edit the .rsp file and - Build & run ILCompiler using **F5**. This will compile the repro project into an `.obj` file. You can debug the compiler and set breakpoints in it at this point. - The last step is linking the file into an executable so that we can launch the result of the AOT compilation. - Open the src\coreclr\tools\aot\ILCompiler\reproNative\reproNative.vcxproj project in Visual Studio. This project is configured to pick up the `.obj` file we just compiled and link it with the rest of the runtime. diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index ab93db20b1ca..9c30be1279f4 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5629,6 +5629,8 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr, CORINFO_EE_INFO* eeInfo = eeGetEEInfo(); #ifdef TARGET_UNIX info.compMatchedVM = info.compMatchedVM && (eeInfo->osType == CORINFO_UNIX); +#elif TARGET_WASM + // TODO: do we need a CORINFO_WASM (or CORINFO_LLVM/CORINFO_BROWSER even though wasm can run outside the browser) #else info.compMatchedVM = info.compMatchedVM && (eeInfo->osType == CORINFO_WINNT); #endif diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index f6323d816d63..5438697ad0ca 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -195,8 +195,10 @@ #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_ARMNT #elif defined(TARGET_ARM64) #define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_ARM64 // 0xAA64 -#elif defined(TARGET_WASM) -#define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_AMD64 // TODO: what is this? 
+#elif defined(TARGET_WASM32) +#define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_WASM32 +#elif defined(TARGET_WASM64) +#define IMAGE_FILE_MACHINE_TARGET IMAGE_FILE_MACHINE_WASM64 #else #error Unsupported or unset target architecture #endif diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp index bb68bb3ae40b..32b847971d77 100644 --- a/src/coreclr/jit/llvm.cpp +++ b/src/coreclr/jit/llvm.cpp @@ -2,7 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. #ifdef TARGET_WASM +#include #include "compiler.h" +#include "block.h" +#include "gentree.h" #include "llvm.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/LLVMContext.h" @@ -22,11 +25,18 @@ static Module* _module; static LLVMContext _llvmContext; static void* _thisPtr; static const char* (*_getMangledMethodName)(void*, CORINFO_METHOD_STRUCT_*); +static char* _outputFileName; -extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) +extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) { _thisPtr = thisPtr; _getMangledMethodName = getMangledMethodNamePtr; +// _outputFileName = getAllocator(CMK_DebugOnly).allocate(strlen(outputFileName) + 1) + _outputFileName = (char*)malloc(strlen(outputFileName) + 7); + strcpy(_outputFileName, "1.txt"); // ??? without this _outputFileName is corrupted + strcpy(_outputFileName, outputFileName); + strcpy(_outputFileName + strlen(_outputFileName) - 4, "clrjit"); // use different module output name for now, TODO: delete if old LLVM gen does not create a module + strcat(_outputFileName, ".bc"); } void Llvm::Init() @@ -37,14 +47,27 @@ void Llvm::Init() void Llvm::llvmShutdown() { #if DEBUG - _module->dump(); -#endif //DEBUG std::error_code ec; - llvm::raw_fd_ostream OS("module.bc", ec); + char* txtFileName = (char *)malloc(strlen(_outputFileName) + 2); // .txt is longer than .bc + strcpy(txtFileName, _outputFileName); + strcpy(txtFileName + strlen(_outputFileName) - 2, "txt"); + llvm::raw_fd_ostream textOutputStream(txtFileName, ec); + _module->print(textOutputStream, (llvm::AssemblyAnnotationWriter*)NULL); + free(txtFileName); +#endif //DEBUG + llvm::raw_fd_ostream OS(_outputFileName, ec); llvm::WriteBitcodeToFile(*_module, OS); // Module.Verify(LLVMVerifierFailureAction.LLVMAbortProcessAction); +} - //Module.WriteBitcodeToFile(_objectFilePath); +bool visitNode(llvm::IRBuilder<> &builder, GenTree* node) +{ + switch (node->gtOper) + { + default: + return false; + } + return true; } //------------------------------------------------------------------------ @@ -59,13 +82,26 @@ void Llvm::Compile(Compiler* pCompiler) { fatal(CORJIT_SKIPPED); } - // TODO: use of getMethodName is wrong as its only for debug purposes. 
const char* mangledName = (*_getMangledMethodName)(_thisPtr, info.compMethodHnd); Function* function = Function::Create(FunctionType::get(Type::getVoidTy(_llvmContext), ArrayRef(), false), Function::InternalLinkage, 0U, mangledName, _module); + BasicBlock* firstBb = pCompiler->fgFirstBB; llvm::IRBuilder<> builder(_llvmContext); - llvm::BasicBlock* entry = llvm::BasicBlock::Create(_llvmContext, "", function); - builder.SetInsertPoint(entry); + for (BasicBlock* block = firstBb; block; block = block->bbNext) + { + llvm::BasicBlock* entry = llvm::BasicBlock::Create(_llvmContext, "", function); + builder.SetInsertPoint(entry); + + for (Statement* stmt = block->bbStmtList; stmt; stmt = stmt->GetNextStmt()) + { + if (!visitNode(builder, stmt->GetRootNode())) + { + // delete created function , dont want duplicate symbols + function->removeFromParent(); + fatal(CORJIT_SKIPPED); // visitNode incomplete + } + } + } builder.CreateRetVoid(); } #endif diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h index dedaef2146dc..f71bb3729731 100644 --- a/src/coreclr/jit/llvm.h +++ b/src/coreclr/jit/llvm.h @@ -17,8 +17,13 @@ #undef NumItems #ifdef TARGET_WASM +#define IMAGE_FILE_MACHINE_WASM32 0xFFFF +#define IMAGE_FILE_MACHINE_WASM64 0xFFFE // TODO: appropriate values for this? Used to check compilation is for intended target -extern "C" void registerLlvmCallbacks(void* thisPtr, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)); + + + +extern "C" void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)); class Llvm { diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index 227d5bd120c6..b16b89b1f24d 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -45,6 +45,8 @@ private enum ImageFileMachine AMD64 = 0x8664, ARM = 0x01c4, ARM64 = 0xaa64, + WASM32 = 0xffff, // matches llvm.h - TODO better to just #if out this check in compiler.cpp? 
+ WASM64 = 0xfffe, } private enum CFI_OPCODE { @@ -3388,7 +3390,9 @@ private uint getExpectedTargetArchitecture() case TargetArchitecture.ARM64: return (uint)ImageFileMachine.ARM64; case TargetArchitecture.Wasm32: - return (uint)ImageFileMachine.AMD64; // TODO + return (uint)ImageFileMachine.WASM32; + case TargetArchitecture.Wasm64: + return (uint)ImageFileMachine.WASM64; default: throw new NotImplementedException("Expected target architecture is not supported"); } diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs index db622e4c3564..05843dfdc431 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/DependencyAnalysis/LLVMMethodCodeNode.cs @@ -12,6 +12,8 @@ namespace ILCompiler.DependencyAnalysis { internal abstract class LLVMMethodCodeNode : DependencyNodeCore, IMethodCodeNode { + private bool _isStateMachineMoveNextMethod; + protected readonly MethodDesc _method; protected DependencyList _dependencies; @@ -91,7 +93,10 @@ public void InitializeNonRelocationDependencies(DependencyList additionalDepende { } - public void InitializeIsStateMachineMoveNextMethod(bool debugInfoIsStateMachineMoveNextMethod) => throw new System.NotImplementedException(); + public void InitializeIsStateMachineMoveNextMethod(bool value) + { + _isStateMachineMoveNextMethod = value; + } } internal class LlvmMethodBodyNode : LLVMMethodCodeNode, IMethodBodyNode diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index b704ebbf46fb..839d63957428 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -20,7 +20,7 @@ namespace ILCompiler public sealed class LLVMCodegenCompilation : RyuJitCompilation { private readonly ConditionalWeakTable _corinfos = new ConditionalWeakTable(); - // private CountdownEvent _compilationCountdown; + private string _outputFile; internal LLVMCodegenConfigProvider Options { get; } internal LLVMModuleRef Module { get; } @@ -59,6 +59,7 @@ private static IEnumerable GetCompilationRoots(IEnumer protected override void CompileInternal(string outputFile, ObjectDumper dumper) { + _outputFile = outputFile; _dependencyGraph.ComputeMarkedNodes(); var nodes = _dependencyGraph.MarkedNodeList; @@ -99,7 +100,7 @@ private void CompileSingleThreaded(List methodsToCompile) CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => { var impl = new CorInfoImpl(this); - impl.RegisterLlvmCallbacks(); + impl.RegisterLlvmCallbacks(_outputFile); return impl; }); @@ -125,8 +126,13 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC try { - corInfo.CompileMethod(methodCodeNodeNeedingCode); - ryuJitMethodCount++; + var sig = method.Signature; + if (sig.Length == 0 && sig.ReturnType == TypeSystemContext.GetWellKnownType(WellKnownType.Void)) // speed up + { + corInfo.CompileMethod(methodCodeNodeNeedingCode); + ryuJitMethodCount++; + } + else ILImporter.CompileMethod(this, methodCodeNodeNeedingCode); } catch (CodeGenerationFailedException) { diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs index 
fd672788da6e..6bc3af55131b 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs @@ -22,14 +22,14 @@ public unsafe sealed partial class CorInfoImpl } [DllImport(JitLibrary)] - private extern static void registerLlvmCallbacks(IntPtr thisHandle, delegate* unmanaged getMangedMethodNamePtr); + private extern static void registerLlvmCallbacks(IntPtr thisHandle, byte* outputFileName, delegate* unmanaged getMangedMethodNamePtr); - public void RegisterLlvmCallbacks() + public void RegisterLlvmCallbacks(string outputFileName) { CorInfoImpl _this = this; _thisStatic = this; - registerLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref _this), (delegate* unmanaged) &getMangledMethodName); + registerLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref _this), (byte*)_thisStatic.GetPin(StringToUTF8(outputFileName)), (delegate* unmanaged) &getMangledMethodName); } } } From d4340cce0389a73a2858fd50652e58476ee6a596 Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 7 Mar 2021 17:58:24 -0500 Subject: [PATCH 21/44] first method compiles --- src/coreclr/jit/llvm.cpp | 41 ++++++++++++++----- src/coreclr/jit/llvm.h | 2 +- .../Compiler/LLVMCodegenCompilation.cs | 7 +++- .../Compiler/LLVMCodegenCompilationBuilder.cs | 20 +++++++++ .../JitInterface/CorInfoImpl.Llvm.cs | 9 ++-- 5 files changed, 63 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp index 32b847971d77..e8d6a0f970bf 100644 --- a/src/coreclr/jit/llvm.cpp +++ b/src/coreclr/jit/llvm.cpp @@ -26,22 +26,26 @@ static LLVMContext _llvmContext; static void* _thisPtr; static const char* (*_getMangledMethodName)(void*, CORINFO_METHOD_STRUCT_*); static char* _outputFileName; +static Function* _doNothingFunction; -extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) +extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* triple, const char* dataLayout, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) { _thisPtr = thisPtr; _getMangledMethodName = getMangledMethodNamePtr; + _module = new Module(llvm::StringRef("netscripten"), _llvmContext); + _module->setTargetTriple(triple); + _module->setDataLayout(dataLayout); + // _outputFileName = getAllocator(CMK_DebugOnly).allocate(strlen(outputFileName) + 1) _outputFileName = (char*)malloc(strlen(outputFileName) + 7); strcpy(_outputFileName, "1.txt"); // ??? 
without this _outputFileName is corrupted strcpy(_outputFileName, outputFileName); - strcpy(_outputFileName + strlen(_outputFileName) - 4, "clrjit"); // use different module output name for now, TODO: delete if old LLVM gen does not create a module + strcpy(_outputFileName + strlen(_outputFileName) - 3, "clrjit"); // use different module output name for now, TODO: delete if old LLVM gen does not create a module strcat(_outputFileName, ".bc"); } void Llvm::Init() { - _module = new Module(llvm::StringRef("netscripten-clrjit"), _llvmContext); } void Llvm::llvmShutdown() @@ -60,12 +64,29 @@ void Llvm::llvmShutdown() // Module.Verify(LLVMVerifierFailureAction.LLVMAbortProcessAction); } +void EmitDoNothingCall(llvm::IRBuilder<>& builder) +{ + if (_doNothingFunction == nullptr) + { + _doNothingFunction = Function::Create(FunctionType::get(Type::getVoidTy(_llvmContext), ArrayRef(), false), Function::ExternalLinkage, 0U, "llvm.donothing", _module); + } + builder.CreateCall(_doNothingFunction); +} + bool visitNode(llvm::IRBuilder<> &builder, GenTree* node) { - switch (node->gtOper) + switch (node->OperGet()) { - default: - return false; + case GT_IL_OFFSET: + break; + case GT_NO_OP: + EmitDoNothingCall(builder); + break; + case GT_RETURN: + builder.CreateRetVoid(); + break; + default: + return false; } return true; } @@ -91,10 +112,11 @@ void Llvm::Compile(Compiler* pCompiler) { llvm::BasicBlock* entry = llvm::BasicBlock::Create(_llvmContext, "", function); builder.SetInsertPoint(entry); - - for (Statement* stmt = block->bbStmtList; stmt; stmt = stmt->GetNextStmt()) + // GenTree* firstGt = block->GetFirstLIRNode(); +// firstGt->VisitOperands(); + for (GenTree* node = block->GetFirstLIRNode(); node; node = node->gtNext) { - if (!visitNode(builder, stmt->GetRootNode())) + if (!visitNode(builder, node)) { // delete created function , dont want duplicate symbols function->removeFromParent(); @@ -102,6 +124,5 @@ void Llvm::Compile(Compiler* pCompiler) } } } - builder.CreateRetVoid(); } #endif diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h index f71bb3729731..c593ffe04356 100644 --- a/src/coreclr/jit/llvm.h +++ b/src/coreclr/jit/llvm.h @@ -23,7 +23,7 @@ -extern "C" void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)); +extern "C" void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* triple, const char* dataLayout, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)); class Llvm { diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index 839d63957428..225eef26e81f 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -100,7 +100,7 @@ private void CompileSingleThreaded(List methodsToCompile) CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => { var impl = new CorInfoImpl(this); - impl.RegisterLlvmCallbacks(_outputFile); + impl.RegisterLlvmCallbacks(_outputFile, Module.Target, Module.DataLayout); return impl; }); @@ -127,9 +127,12 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC try { var sig = method.Signature; - if (sig.Length == 0 && sig.ReturnType == TypeSystemContext.GetWellKnownType(WellKnownType.Void)) // speed up + if (sig.Length == 0 && sig.ReturnType == 
TypeSystemContext.GetWellKnownType(WellKnownType.Void) && + sig.IsStatic && method.Name == "Initialize" && method.OwningType.ToString().Contains("GCStress")) // speed up { corInfo.CompileMethod(methodCodeNodeNeedingCode); + methodCodeNodeNeedingCode.CompilationCompleted = true; + methodCodeNodeNeedingCode.SetDependencies(new DependencyNodeCore.DependencyList()); // TODO: how to track - check RyuJITCompilation ryuJitMethodCount++; } else ILImporter.CompileMethod(this, methodCodeNodeNeedingCode); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs index 2102d1da524b..92ca57d853b2 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilationBuilder.cs @@ -28,6 +28,26 @@ public LLVMCodegenCompilationBuilder(CompilerTypeSystemContext context, Compilat public override CompilationBuilder UseBackendOptions(IEnumerable options) { _config = new LLVMCodegenConfigProvider(options); + var builder = new ArrayBuilder>(); + + foreach (string param in options) + { + int indexOfEquals = param.IndexOf('='); + + // We're skipping bad parameters without reporting. + // This is not a mainstream feature that would need to be friendly. + // Besides, to really validate this, we would also need to check that the config name is known. + if (indexOfEquals < 1) + continue; + + string name = param.Substring(0, indexOfEquals); + string value = param.Substring(indexOfEquals + 1); + + builder.Add(new KeyValuePair(name, value)); + } + + _ryujitOptions = builder.ToArray(); + return this; } diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs index 6bc3af55131b..b4bcf103b52c 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs @@ -22,14 +22,17 @@ public unsafe sealed partial class CorInfoImpl } [DllImport(JitLibrary)] - private extern static void registerLlvmCallbacks(IntPtr thisHandle, byte* outputFileName, delegate* unmanaged getMangedMethodNamePtr); + private extern static void registerLlvmCallbacks(IntPtr thisHandle, byte* outputFileName, byte* triple, byte* dataLayout, delegate* unmanaged getMangedMethodNamePtr); - public void RegisterLlvmCallbacks(string outputFileName) + public void RegisterLlvmCallbacks(string outputFileName, string triple, string dataLayout) { CorInfoImpl _this = this; _thisStatic = this; - registerLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref _this), (byte*)_thisStatic.GetPin(StringToUTF8(outputFileName)), (delegate* unmanaged) &getMangledMethodName); + registerLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref _this), (byte*)_thisStatic.GetPin(StringToUTF8(outputFileName)), + (byte*)_thisStatic.GetPin(StringToUTF8(triple)), + (byte*)_thisStatic.GetPin(StringToUTF8(dataLayout)), + (delegate* unmanaged) &getMangledMethodName); } } } From 812e6ce487e34961cd265a11d988adfdc48d49ed Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 8 Mar 2021 19:08:25 -0500 Subject: [PATCH 22/44] creates 8 methods! 
--- src/coreclr/jit/llvm.cpp | 37 ++++++--- .../CodeGen/ILToLLVMImporter.cs | 78 ++++--------------- .../CodeGen/LLVMObjectWriter.cs | 4 + .../Compiler/LLVMCodegenCompilation.cs | 60 +++++++++++++- 4 files changed, 104 insertions(+), 75 deletions(-) diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp index e8d6a0f970bf..779a39fddb23 100644 --- a/src/coreclr/jit/llvm.cpp +++ b/src/coreclr/jit/llvm.cpp @@ -28,11 +28,13 @@ static const char* (*_getMangledMethodName)(void*, CORINFO_METHOD_STRUCT_*); static char* _outputFileName; static Function* _doNothingFunction; +Compiler::Info _info; + extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* triple, const char* dataLayout, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) { _thisPtr = thisPtr; _getMangledMethodName = getMangledMethodNamePtr; - _module = new Module(llvm::StringRef("netscripten"), _llvmContext); + _module = new Module(llvm::StringRef("netscripten-clrjit"), _llvmContext); _module->setTargetTriple(triple); _module->setDataLayout(dataLayout); @@ -61,9 +63,21 @@ void Llvm::llvmShutdown() #endif //DEBUG llvm::raw_fd_ostream OS(_outputFileName, ec); llvm::WriteBitcodeToFile(*_module, OS); + //_module->end(); + delete _module; // Module.Verify(LLVMVerifierFailureAction.LLVMAbortProcessAction); } +FunctionType* GetFunctionTypeForMethod(Compiler::Info info) +{ + if (info.compArgsCount != 0 || info.compRetType != TYP_VOID) + { + fatal(CORJIT_SKIPPED); + } + // all functions have shadow stack as first arg (i8*) + return FunctionType::get(Type::getVoidTy(_llvmContext), ArrayRef(Type::getInt8PtrTy(_llvmContext)), false); +} + void EmitDoNothingCall(llvm::IRBuilder<>& builder) { if (_doNothingFunction == nullptr) @@ -96,20 +110,22 @@ bool visitNode(llvm::IRBuilder<> &builder, GenTree* node) // void Llvm::Compile(Compiler* pCompiler) { - Compiler::Info info = pCompiler->info; + _info = pCompiler->info; - //TODO: delete - if (info.compArgsCount != 0 || info.compRetType != TYP_VOID) - { - fatal(CORJIT_SKIPPED); - } - const char* mangledName = (*_getMangledMethodName)(_thisPtr, info.compMethodHnd); - Function* function = Function::Create(FunctionType::get(Type::getVoidTy(_llvmContext), ArrayRef(), false), Function::InternalLinkage, 0U, mangledName, _module); + const char* mangledName = (*_getMangledMethodName)(_thisPtr, _info.compMethodHnd); + Function* function = Function::Create(GetFunctionTypeForMethod(_info), Function::ExternalLinkage, 0U, mangledName, _module); // TODO: ExternalLinkage forced as linked from old module BasicBlock* firstBb = pCompiler->fgFirstBB; llvm::IRBuilder<> builder(_llvmContext); for (BasicBlock* block = firstBb; block; block = block->bbNext) { + if (block->hasTryIndex()) + { + function->dropAllReferences(); + function->eraseFromParent(); + fatal(CORJIT_SKIPPED); // TODO: skip anything with a try block for now + } + llvm::BasicBlock* entry = llvm::BasicBlock::Create(_llvmContext, "", function); builder.SetInsertPoint(entry); // GenTree* firstGt = block->GetFirstLIRNode(); @@ -119,7 +135,8 @@ void Llvm::Compile(Compiler* pCompiler) if (!visitNode(builder, node)) { // delete created function , dont want duplicate symbols - function->removeFromParent(); + function->dropAllReferences(); + function->eraseFromParent(); fatal(CORJIT_SKIPPED); // visitNode incomplete } } diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs index 
891f236ada8e..bf42edf021cc 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs @@ -348,7 +348,7 @@ private void GenerateProlog() // Keep track of where we are in the llvm signature, starting after the // shadow stack pointer and return address int signatureIndex = 1; - if (NeedsReturnStackSlot(_signature)) + if (_compilation.NeedsReturnStackSlot(_signature)) { signatureIndex++; } @@ -510,7 +510,7 @@ private void GenerateProlog() private LLVMValueRef CreateLLVMFunction(string mangledName, MethodSignature signature, bool hasHiddenParameter) { - return Module.AddFunction(mangledName, GetLLVMSignatureForMethod(signature, hasHiddenParameter)); + return Module.AddFunction(mangledName, _compilation.GetLLVMSignatureForMethod(signature, hasHiddenParameter)); } private LLVMValueRef GetOrCreateLLVMFunction(string mangledName, MethodSignature signature, bool hasHiddenParam) @@ -1653,7 +1653,7 @@ private bool CanStoreVariableOnStack(TypeDesc variableType) /// Returns true if the type can be stored on the local stack /// instead of the shadow stack in this method. /// - private static bool CanStoreTypeOnStack(TypeDesc type) + internal static bool CanStoreTypeOnStack(TypeDesc type) { if (type is DefType defType) { @@ -1670,15 +1670,6 @@ private static bool CanStoreTypeOnStack(TypeDesc type) return false; } - /// - /// Returns true if the method returns a type that must be kept - /// on the shadow stack - /// - private static bool NeedsReturnStackSlot(MethodSignature signature) - { - return !signature.ReturnType.IsVoid && !CanStoreTypeOnStack(signature.ReturnType); - } - private int GetTotalParameterOffset() { int offset = 0; @@ -1922,7 +1913,7 @@ private void ImportReturn() LLVMTypeRef valueType = GetLLVMTypeForTypeDesc(_signature.ReturnType); LLVMValueRef castValue = retVal.ValueAsType(valueType, _builder); - if (NeedsReturnStackSlot(_signature)) + if (_compilation.NeedsReturnStackSlot(_signature)) { var retParam = _llvmFunction.GetParam(1); ImportStoreHelper(castValue, valueType, retParam, 0); @@ -2277,7 +2268,7 @@ private LLVMValueRef GetCallableVirtualMethod(LLVMValueRef thisPointer, MethodDe LLVMValueRef slot = GetOrCreateMethodSlot(runtimeDeterminedMethod, callee); - LLVMTypeRef llvmSignature = GetLLVMSignatureForMethod(runtimeDeterminedMethod.Signature, false); + LLVMTypeRef llvmSignature = _compilation.GetLLVMSignatureForMethod(runtimeDeterminedMethod.Signature, false); LLVMValueRef functionPtr; ThrowIfNull(thisPointer); if (runtimeDeterminedMethod.OwningType.IsInterface) @@ -2387,51 +2378,10 @@ private LLVMValueRef GetCallableGenericVirtualMethod(StackEntry objectPtr, Metho new LLVMBasicBlockRef[] { fatBranch, notFatBranch }, 2); // dont know the type for sure, but will generate for no hidden dict param and change if necessary before calling. 
- var asFunc = CastIfNecessary(_builder, loadPtr, LLVMTypeRef.CreatePointer(GetLLVMSignatureForMethod(runtimeDeterminedMethod.Signature, false), 0) , "castToFunc"); + var asFunc = CastIfNecessary(_builder, loadPtr, LLVMTypeRef.CreatePointer(_compilation.GetLLVMSignatureForMethod(runtimeDeterminedMethod.Signature, false), 0) , "castToFunc"); return asFunc; } - private LLVMTypeRef GetLLVMSignatureForMethod(MethodSignature signature, bool hasHiddenParam) - { - TypeDesc returnType = signature.ReturnType; - LLVMTypeRef llvmReturnType; - bool returnOnStack = false; - if (!NeedsReturnStackSlot(signature)) - { - returnOnStack = true; - llvmReturnType = GetLLVMTypeForTypeDesc(returnType); - } - else - { - llvmReturnType = LLVMTypeRef.Void; - } - - List signatureTypes = new List(); - signatureTypes.Add(LLVMTypeRef.CreatePointer(LLVMTypeRef.Int8, 0)); // Shadow stack pointer - - if (!returnOnStack && returnType != GetWellKnownType(WellKnownType.Void)) - { - signatureTypes.Add(LLVMTypeRef.CreatePointer(LLVMTypeRef.Int8, 0)); - } - - if (hasHiddenParam) - { - signatureTypes.Add(LLVMTypeRef.CreatePointer(LLVMTypeRef.Int8, 0)); // *EEType - } - - // Intentionally skipping the 'this' pointer since it could always be a GC reference - // and thus must be on the shadow stack - foreach (TypeDesc type in signature) - { - if (CanStoreTypeOnStack(type)) - { - signatureTypes.Add(GetLLVMTypeForTypeDesc(type)); - } - } - - return LLVMTypeRef.CreateFunction(llvmReturnType, signatureTypes.ToArray(), false); - } - private ExpressionEntry AllocateObject(StackEntry eeType, TypeDesc forcedReturnType = null) { //TODO: call GetNewObjectHelperForType from JitHelper.cs (needs refactoring) @@ -2771,7 +2721,7 @@ private bool ImportIntrinsicCall(MethodDesc method, MethodDesc runtimeDetermined TypeDesc returnType = signature.ReturnType; - bool needsReturnSlot = NeedsReturnStackSlot(signature); + bool needsReturnSlot = _compilation.NeedsReturnStackSlot(signature); SpilledExpressionEntry returnSlot = null; var actualReturnType = forcedReturnType ?? returnType; if (needsReturnSlot) @@ -2831,7 +2781,7 @@ private bool ImportIntrinsicCall(MethodDesc method, MethodDesc runtimeDetermined { if (_isUnboxingThunk && _method.RequiresInstArg()) { - hiddenParam = _currentFunclet.GetParam((uint)(1 + (NeedsReturnStackSlot(_signature) ? 1 : 0))); + hiddenParam = _currentFunclet.GetParam((uint)(1 + (_compilation.NeedsReturnStackSlot(_signature) ? 1 : 0))); } else if (canonMethod.RequiresInstMethodDescArg()) { @@ -2921,7 +2871,7 @@ private bool ImportIntrinsicCall(MethodDesc method, MethodDesc runtimeDetermined // else builder.PositionAtEnd(fatBranch); - var fnWithDict = builder.BuildCast(LLVMOpcode.LLVMBitCast, fn, LLVMTypeRef.CreatePointer(GetLLVMSignatureForMethod(runtimeDeterminedMethod.Signature, true), 0), "fnWithDict"); + var fnWithDict = builder.BuildCast(LLVMOpcode.LLVMBitCast, fn, LLVMTypeRef.CreatePointer(_compilation.GetLLVMSignatureForMethod(runtimeDeterminedMethod.Signature, true), 0), "fnWithDict"); var dictDereffed = builder.BuildLoad(builder.BuildLoad( dict, "l1"), "l2"); llvmArgs.Insert(needsReturnSlot ? 
2 : 1, dictDereffed); LLVMValueRef fatReturn = CallOrInvoke(fromLandingPad, builder, currentTryRegion, fnWithDict, llvmArgs.ToArray(), ref nextInstrBlock); @@ -3411,7 +3361,7 @@ private void EmitNativeToManagedThunk(LLVMCodegenCompilation compilation, Method List llvmArgs = new List(); llvmArgs.Add(calleeFrame); - bool needsReturnSlot = NeedsReturnStackSlot(method.Signature); + bool needsReturnSlot = _compilation.NeedsReturnStackSlot(method.Signature); if (needsReturnSlot) { @@ -3465,8 +3415,8 @@ private void ImportCalli(int token) { MethodSignature methodSignature = (MethodSignature)_canonMethodIL.GetObject(token); - var noHiddenParamSig = GetLLVMSignatureForMethod(methodSignature, false); - var hddenParamSig = GetLLVMSignatureForMethod(methodSignature, true); + var noHiddenParamSig = _compilation.GetLLVMSignatureForMethod(methodSignature, false); + var hddenParamSig = _compilation.GetLLVMSignatureForMethod(methodSignature, true); var target = ((ExpressionEntry)_stack.Pop()).ValueAsType(LLVMTypeRef.CreatePointer(noHiddenParamSig, 0), _builder); var functionPtrAsInt = _builder.BuildPtrToInt(target, LLVMTypeRef.Int32, "ptrToInt"); @@ -5252,7 +5202,7 @@ LLVMValueRef GetGenericContext() uint GetHiddenContextParamNo() { - return 1 + (NeedsReturnStackSlot(_method.Signature) ? (uint)1 : 0); + return 1 + (_compilation.NeedsReturnStackSlot(_method.Signature) ? (uint)1 : 0); } bool FuncletsRequireHiddenContext() @@ -5478,7 +5428,7 @@ private TypeDesc ResolveTypeToken(int token) private TypeDesc GetWellKnownType(WellKnownType wellKnownType) { - return _compilation.TypeSystemContext.GetWellKnownType(wellKnownType); + return _compilation.GetWellKnownType(wellKnownType); } private void ReportInvalidBranchTarget(int targetOffset) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/LLVMObjectWriter.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/LLVMObjectWriter.cs index 54401a6eaa60..acb54ef4e08f 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/LLVMObjectWriter.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/LLVMObjectWriter.cs @@ -308,6 +308,10 @@ public SymbolRefData(bool isFunction, string symbolName, uint offset) public LLVMValueRef ToLLVMValueRef(LLVMModuleRef module) { + if (SymbolName == "S_P_CoreLib_System_Diagnostics_Tracing_EventListener___cctor") + { + + } LLVMValueRef valRef = IsFunction ? 
module.GetNamedFunction(SymbolName) : module.GetNamedGlobal(SymbolName); if (Offset != 0 && valRef.Handle != IntPtr.Zero) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index 225eef26e81f..df1e189bb97e 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -128,11 +128,14 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC { var sig = method.Signature; if (sig.Length == 0 && sig.ReturnType == TypeSystemContext.GetWellKnownType(WellKnownType.Void) && - sig.IsStatic && method.Name == "Initialize" && method.OwningType.ToString().Contains("GCStress")) // speed up + sig.IsStatic) // speed up { corInfo.CompileMethod(methodCodeNodeNeedingCode); methodCodeNodeNeedingCode.CompilationCompleted = true; methodCodeNodeNeedingCode.SetDependencies(new DependencyNodeCore.DependencyList()); // TODO: how to track - check RyuJITCompilation + // TODO: delete this external function when old module is gone + LLVMValueRef externFunc = Module.AddFunction(NodeFactory.NameMangler.GetMangledMethodName(method).ToString(), GetLLVMSignatureForMethod(sig, method.RequiresInstArg())); + externFunc.Linkage = LLVMLinkage.LLVMExternalLinkage; ryuJitMethodCount++; } else ILImporter.CompileMethod(this, methodCodeNodeNeedingCode); @@ -183,5 +186,60 @@ public TypeDesc ConvertToCanonFormIfNecessary(TypeDesc type, CanonicalFormKind p return type.ConvertToCanonForm(policy); } + + public LLVMTypeRef GetLLVMSignatureForMethod(MethodSignature signature, bool hasHiddenParam) + { + TypeDesc returnType = signature.ReturnType; + LLVMTypeRef llvmReturnType; + bool returnOnStack = false; + if (!NeedsReturnStackSlot(signature)) + { + returnOnStack = true; + llvmReturnType = ILImporter.GetLLVMTypeForTypeDesc(returnType); + } + else + { + llvmReturnType = LLVMTypeRef.Void; + } + + List signatureTypes = new List(); + signatureTypes.Add(LLVMTypeRef.CreatePointer(LLVMTypeRef.Int8, 0)); // Shadow stack pointer + + if (!returnOnStack && returnType != GetWellKnownType(WellKnownType.Void)) + { + signatureTypes.Add(LLVMTypeRef.CreatePointer(LLVMTypeRef.Int8, 0)); + } + + if (hasHiddenParam) + { + signatureTypes.Add(LLVMTypeRef.CreatePointer(LLVMTypeRef.Int8, 0)); // *EEType + } + + // Intentionally skipping the 'this' pointer since it could always be a GC reference + // and thus must be on the shadow stack + foreach (TypeDesc type in signature) + { + if (ILImporter.CanStoreTypeOnStack(type)) + { + signatureTypes.Add(ILImporter.GetLLVMTypeForTypeDesc(type)); + } + } + + return LLVMTypeRef.CreateFunction(llvmReturnType, signatureTypes.ToArray(), false); + } + + /// + /// Returns true if the method returns a type that must be kept + /// on the shadow stack + /// + public bool NeedsReturnStackSlot(MethodSignature signature) + { + return !signature.ReturnType.IsVoid && !ILImporter.CanStoreTypeOnStack(signature.ReturnType); + } + + public TypeDesc GetWellKnownType(WellKnownType wellKnownType) + { + return TypeSystemContext.GetWellKnownType(wellKnownType); + } } } From 538617e289abd4fa77d0ed7283874d7e07c826cf Mon Sep 17 00:00:00 2001 From: yowl Date: Wed, 10 Mar 2021 20:40:48 -0500 Subject: [PATCH 23/44] add min/max to clr_std/utility --- src/coreclr/inc/clr_std/utility | 16 ++++++++++++++++ src/coreclr/jit/CMakeLists.txt | 10 +++++----- src/coreclr/jit/compiler.cpp | 28 ++++++---------------------- 
src/coreclr/jit/llvm.h | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/coreclr/inc/clr_std/utility b/src/coreclr/inc/clr_std/utility index 1b6b5a7b72c1..4f72ab46e32a 100644 --- a/src/coreclr/inc/clr_std/utility +++ b/src/coreclr/inc/clr_std/utility @@ -49,6 +49,22 @@ namespace std { // forward _Arg, given explicitly specified type parameter return ((T&&)_Arg); } + + template + constexpr const T& + (min)(const T& _Left, const T& _Right) + { + // return smaller of _Left and _Right + return _Right < _Left ? _Right : _Left; + } + + template + constexpr const T& + (max)(const T& _Left, const T& _Right) + { + // return larger of _Left and _Right + return _Left < _Right ? _Right : _Left; + } } namespace std diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index f3695da617b1..1a45b95851e7 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -548,17 +548,17 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) - if (NOT CLR_CMAKE_HOST_UNIX) - create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) - # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll - #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) - endif (NOT CLR_CMAKE_HOST_UNIX) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) endif(CLR_CMAKE_TARGET_UNIX) endif (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) +if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) + # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll + #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) +endif (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") if (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_PGO_INSTRUMENT) # Copy PGO dependency to target dir diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 9c30be1279f4..cca45918f4dc 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -23,6 +23,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jittelemetry.h" #include "patchpointinfo.h" #include "jitstd/algorithm.h" + +#undef min +#undef max + #if defined(TARGET_WASM) #include "llvm.h" #else @@ -6655,11 +6659,7 @@ void Compiler::compInitVarScopeMap() compVarScopeMap = new (getAllocator()) VarNumToScopeDscMap(getAllocator()); // 599 prime to limit huge allocations; for ex: duplicated scopes on single var. -#ifdef TARGET_WASM compVarScopeMap->Reallocate(std::min(info.compVarScopesCount, 599U)); -#else - compVarScopeMap->Reallocate(min(info.compVarScopesCount, 599U)); -#endif for (unsigned i = 0; i < info.compVarScopesCount; ++i) { @@ -7784,24 +7784,16 @@ void CompTimeSummaryInfo::AddInfo(CompTimeInfo& info, bool includePhases) // Update the totals and maxima. 
m_total.m_byteCodeBytes += info.m_byteCodeBytes; -#ifdef TARGET_WASM m_maximum.m_byteCodeBytes = std::max(m_maximum.m_byteCodeBytes, info.m_byteCodeBytes); -#else - m_maximum.m_byteCodeBytes = max(m_maximum.m_byteCodeBytes, info.m_byteCodeBytes); -#endif m_total.m_totalCycles += info.m_totalCycles; -#ifdef TARGET_WASM m_maximum.m_totalCycles = std::max(m_maximum.m_totalCycles, info.m_totalCycles); -#else - m_maximum.m_totalCycles = max(m_maximum.m_totalCycles, info.m_totalCycles); -#endif #if MEASURE_CLRAPI_CALLS // Update the CLR-API values. m_total.m_allClrAPIcalls += info.m_allClrAPIcalls; - m_maximum.m_allClrAPIcalls = max(m_maximum.m_allClrAPIcalls, info.m_allClrAPIcalls); + m_maximum.m_allClrAPIcalls = std::max(m_maximum.m_allClrAPIcalls, info.m_allClrAPIcalls); m_total.m_allClrAPIcycles += info.m_allClrAPIcycles; - m_maximum.m_allClrAPIcycles = max(m_maximum.m_allClrAPIcycles, info.m_allClrAPIcycles); + m_maximum.m_allClrAPIcycles = std::max(m_maximum.m_allClrAPIcycles, info.m_allClrAPIcycles); #endif if (includeInFiltered) @@ -7831,22 +7823,14 @@ void CompTimeSummaryInfo::AddInfo(CompTimeInfo& info, bool includePhases) m_filtered.m_CLRcyclesByPhase[i] += info.m_CLRcyclesByPhase[i]; #endif } -#ifdef TARGET_WASM m_maximum.m_cyclesByPhase[i] = std::max(m_maximum.m_cyclesByPhase[i], info.m_cyclesByPhase[i]); -#else - m_maximum.m_cyclesByPhase[i] = max(m_maximum.m_cyclesByPhase[i], info.m_cyclesByPhase[i]); -#endif #if MEASURE_CLRAPI_CALLS m_maximum.m_CLRcyclesByPhase[i] = max(m_maximum.m_CLRcyclesByPhase[i], info.m_CLRcyclesByPhase[i]); #endif } m_total.m_parentPhaseEndSlop += info.m_parentPhaseEndSlop; -#ifdef TARGET_WASM m_maximum.m_parentPhaseEndSlop = std::max(m_maximum.m_parentPhaseEndSlop, info.m_parentPhaseEndSlop); -#else - m_maximum.m_parentPhaseEndSlop = max(m_maximum.m_parentPhaseEndSlop, info.m_parentPhaseEndSlop); -#endif } #if MEASURE_CLRAPI_CALLS else diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h index c593ffe04356..d38dd25df818 100644 --- a/src/coreclr/jit/llvm.h +++ b/src/coreclr/jit/llvm.h @@ -10,7 +10,7 @@ #include "jitpch.h" #include -// these break std::min/max +// these break std::min/max in LLVM's headers #undef min #undef max // this breaks StringMap.h From 3cf6e3814061c1d94d27306f834bc8a54ca6cd8c Mon Sep 17 00:00:00 2001 From: yowl Date: Wed, 10 Mar 2021 20:42:46 -0500 Subject: [PATCH 24/44] add min/max to clr_std/utility --- src/coreclr/inc/clr_std/utility | 16 ++++++++++++++++ src/coreclr/jit/CMakeLists.txt | 10 +++++----- src/coreclr/jit/compiler.cpp | 28 ++++++---------------------- src/coreclr/jit/llvm.h | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/coreclr/inc/clr_std/utility b/src/coreclr/inc/clr_std/utility index 1b6b5a7b72c1..4f72ab46e32a 100644 --- a/src/coreclr/inc/clr_std/utility +++ b/src/coreclr/inc/clr_std/utility @@ -49,6 +49,22 @@ namespace std { // forward _Arg, given explicitly specified type parameter return ((T&&)_Arg); } + + template + constexpr const T& + (min)(const T& _Left, const T& _Right) + { + // return smaller of _Left and _Right + return _Right < _Left ? _Right : _Left; + } + + template + constexpr const T& + (max)(const T& _Left, const T& _Right) + { + // return larger of _Left and _Right + return _Left < _Right ? 
_Right : _Left; + } } namespace std diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index f3695da617b1..1a45b95851e7 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -548,17 +548,17 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) - if (NOT CLR_CMAKE_HOST_UNIX) - create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) - # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll - #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) - endif (NOT CLR_CMAKE_HOST_UNIX) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) endif(CLR_CMAKE_TARGET_UNIX) endif (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) +if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) + # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll + #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) +endif (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") if (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_PGO_INSTRUMENT) # Copy PGO dependency to target dir diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 9c30be1279f4..cca45918f4dc 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -23,6 +23,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jittelemetry.h" #include "patchpointinfo.h" #include "jitstd/algorithm.h" + +#undef min +#undef max + #if defined(TARGET_WASM) #include "llvm.h" #else @@ -6655,11 +6659,7 @@ void Compiler::compInitVarScopeMap() compVarScopeMap = new (getAllocator()) VarNumToScopeDscMap(getAllocator()); // 599 prime to limit huge allocations; for ex: duplicated scopes on single var. -#ifdef TARGET_WASM compVarScopeMap->Reallocate(std::min(info.compVarScopesCount, 599U)); -#else - compVarScopeMap->Reallocate(min(info.compVarScopesCount, 599U)); -#endif for (unsigned i = 0; i < info.compVarScopesCount; ++i) { @@ -7784,24 +7784,16 @@ void CompTimeSummaryInfo::AddInfo(CompTimeInfo& info, bool includePhases) // Update the totals and maxima. m_total.m_byteCodeBytes += info.m_byteCodeBytes; -#ifdef TARGET_WASM m_maximum.m_byteCodeBytes = std::max(m_maximum.m_byteCodeBytes, info.m_byteCodeBytes); -#else - m_maximum.m_byteCodeBytes = max(m_maximum.m_byteCodeBytes, info.m_byteCodeBytes); -#endif m_total.m_totalCycles += info.m_totalCycles; -#ifdef TARGET_WASM m_maximum.m_totalCycles = std::max(m_maximum.m_totalCycles, info.m_totalCycles); -#else - m_maximum.m_totalCycles = max(m_maximum.m_totalCycles, info.m_totalCycles); -#endif #if MEASURE_CLRAPI_CALLS // Update the CLR-API values. 
m_total.m_allClrAPIcalls += info.m_allClrAPIcalls; - m_maximum.m_allClrAPIcalls = max(m_maximum.m_allClrAPIcalls, info.m_allClrAPIcalls); + m_maximum.m_allClrAPIcalls = std::max(m_maximum.m_allClrAPIcalls, info.m_allClrAPIcalls); m_total.m_allClrAPIcycles += info.m_allClrAPIcycles; - m_maximum.m_allClrAPIcycles = max(m_maximum.m_allClrAPIcycles, info.m_allClrAPIcycles); + m_maximum.m_allClrAPIcycles = std::max(m_maximum.m_allClrAPIcycles, info.m_allClrAPIcycles); #endif if (includeInFiltered) @@ -7831,22 +7823,14 @@ void CompTimeSummaryInfo::AddInfo(CompTimeInfo& info, bool includePhases) m_filtered.m_CLRcyclesByPhase[i] += info.m_CLRcyclesByPhase[i]; #endif } -#ifdef TARGET_WASM m_maximum.m_cyclesByPhase[i] = std::max(m_maximum.m_cyclesByPhase[i], info.m_cyclesByPhase[i]); -#else - m_maximum.m_cyclesByPhase[i] = max(m_maximum.m_cyclesByPhase[i], info.m_cyclesByPhase[i]); -#endif #if MEASURE_CLRAPI_CALLS m_maximum.m_CLRcyclesByPhase[i] = max(m_maximum.m_CLRcyclesByPhase[i], info.m_CLRcyclesByPhase[i]); #endif } m_total.m_parentPhaseEndSlop += info.m_parentPhaseEndSlop; -#ifdef TARGET_WASM m_maximum.m_parentPhaseEndSlop = std::max(m_maximum.m_parentPhaseEndSlop, info.m_parentPhaseEndSlop); -#else - m_maximum.m_parentPhaseEndSlop = max(m_maximum.m_parentPhaseEndSlop, info.m_parentPhaseEndSlop); -#endif } #if MEASURE_CLRAPI_CALLS else diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h index c593ffe04356..d38dd25df818 100644 --- a/src/coreclr/jit/llvm.h +++ b/src/coreclr/jit/llvm.h @@ -10,7 +10,7 @@ #include "jitpch.h" #include -// these break std::min/max +// these break std::min/max in LLVM's headers #undef min #undef max // this breaks StringMap.h From 8345deaad9a8afe9118aa2af61c7b60da1c9408b Mon Sep 17 00:00:00 2001 From: yowl Date: Thu, 11 Mar 2021 18:23:42 -0500 Subject: [PATCH 25/44] first attempt to get the ci to build llvm. --- docs/workflow/building/coreclr/nativeaot.md | 2 +- eng/common/build.ps1 | 1 + eng/pipelines/common/global-build-job.yml | 4 ++- eng/pipelines/runtimelab/install-llvm.cmd | 11 ++++++++ eng/pipelines/runtimelab/install-llvm.ps1 | 30 +++++++++++++++++++++ src/coreclr/jit/CMakeLists.txt | 3 +-- 6 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 eng/pipelines/runtimelab/install-llvm.cmd create mode 100644 eng/pipelines/runtimelab/install-llvm.ps1 diff --git a/docs/workflow/building/coreclr/nativeaot.md b/docs/workflow/building/coreclr/nativeaot.md index 455b48fa416f..5c415175981b 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -18,7 +18,7 @@ The Native AOT toolchain can be currently built for Linux, macOS and Windows x64 - Extract and create a subdirectory in the llvm-11.0.0.src folder called build. cd to this build folder - Configure the LLVM source to use the same runtime as clrjit `cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -D LLVM_USE_CRT_DEBUG=MTd ..` - Build LLVM either from the command line (`build`) or from VS 2019. You only really need to build the LLVMCore and LLVMBitWriter projects which takes less time than the 400 odd projects when building all. This will save some time. -- Edit `src/coreclr/jit/CMakeLists.txt` and change `find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm)` to where you have built LLVM +- set an enviroment variable to locate the LLVM config, e.g. `set LLVM_CMAKE_CONFIG=E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm` - Build the x64 libraries and compiler as per the Building section. 
- Run `build nativeaot+libs+nativeaot.packages -rc [Debug|Release] -lc [Debug|Release] -a wasm -os Browser -runtimeFlavor CoreCLR` - The compiler can now be debugged with the Wasm clrjit. Load the clrjit_browser_wasm32_x64.vcxproj which can be found in artifacts\obj\coreclr\windows.x64.Debug\jit diff --git a/eng/common/build.ps1 b/eng/common/build.ps1 index d9b6b3adc038..1c5b0e6d5d57 100644 --- a/eng/common/build.ps1 +++ b/eng/common/build.ps1 @@ -143,6 +143,7 @@ try { } if ($properties.Contains('/p:TargetArchitecture=wasm') -and $runtimeFlavor -eq "CoreCLR") { . $PSScriptRoot\..\..\wasm-tools\emsdk\emsdk_env.ps1 + $Env:LLVM_CMAKE_CONFIG = $PSScriptRoot\..\..\wasm-tools\llvm-11.0.0.src\build\lib\cmake\llvm } $nodeReuse = $false } diff --git a/eng/pipelines/common/global-build-job.yml b/eng/pipelines/common/global-build-job.yml index 0bce85ed875d..e4f749f4754e 100644 --- a/eng/pipelines/common/global-build-job.yml +++ b/eng/pipelines/common/global-build-job.yml @@ -122,9 +122,11 @@ jobs: displayName: Disk Usage before Build - ${{ if and(eq(parameters.runtimeFlavor, 'coreclr'), and(eq(parameters.osGroup, 'windows'), eq(parameters.platform, 'Browser_wasm'))) }}: - # Install Wasm dependencies: emscripten + # Install Wasm dependencies: emscripten, LLVM - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-emscripten.cmd $(Build.SourcesDirectory)\wasm-tools displayName: Install/activate emscripten + - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-llvm.cmd $(Build.SourcesDirectory)\wasm-tools + displayName: Install/build LLVM # Build - script: $(Build.SourcesDirectory)$(dir)build$(scriptExt) -ci -arch ${{ parameters.archType }} $(_osParameter) ${{ parameters.buildArgs }} $(_officialBuildParameter) $(_crossBuildPropertyArg) $(_cxx11Parameter) $(_richCodeNavigationParam) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd new file mode 100644 index 000000000000..762290bf9599 --- /dev/null +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -0,0 +1,11 @@ +mkdir "%1" 2>nul +cd /D "%1" + +powershell -NoProfile -NoLogo -ExecutionPolicy ByPass -command "& """%~dp0install-llvm.ps1""" %*" +if %errorlevel% NEQ 0 goto fail + +exit /b 0 + +fail: +echo "Failed to install llvm" +exit /b 1 diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 new file mode 100644 index 000000000000..ab6f879e0d8e --- /dev/null +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -0,0 +1,30 @@ + +# LLVM is supplied in a gz file which Windows doesn't native understand, so we need gz to unpack it - TODO this is liable to fail randomly when a new version comes out and the version number changes +Invoke-WebRequest -Uri https://tukaani.org/xz/xz-5.2.5-windows.zip -OutFile xz.zip +Expand-Archive -LiteralPath xz.zip -DestinationPath . +copy bin_i686\xz.exe . # get it in the path for tar + +Invoke-WebRequest -Uri https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/llvm-11.0.0.src.tar.xz -OutFile llvm-11.0.0.src.tar.xz + +dir + +./xz -d llvm-11.0.0.src.tar.xz + +tar -xf llvm-11.0.0.src.tar + + +cd llvm-11.0.0.src +mkdir build +dir +cd build +# TODO Release build +cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. 
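+# Assumption: LLVM_USE_CRT_DEBUG=MTd keeps LLVM on the same static debug CRT that the Debug clrjit
+# links against ("use the same runtime as clrjit" in nativeaot.md); a mismatched CRT flavor would be
+# expected to surface as link conflicts when clrjit_browser_wasm32_x64 pulls in LLVMCore/LLVMBitWriter.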
+ +msbuild LLVM.sln /t:LLVMCore +msbuild LLVM.sln /t:LLVMBitWriter +#msbuild LLVM.sln /t:LLVMDebugInfoDwarf + +dir + + + diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 1a45b95851e7..0bee1b0b8712 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -78,8 +78,7 @@ function(create_standalone_jit) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) - # TODO LLVM build location? - find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm) + find_package(LLVM REQUIRED CONFIG PATHS $ENV{LLVM_CMAKE_CONFIG}) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) llvm_map_components_to_libnames(llvm_libs core bitwriter) From 1b4eaf8cd79c0e60ea45c3467a1e192698127a24 Mon Sep 17 00:00:00 2001 From: yowl Date: Thu, 11 Mar 2021 18:48:44 -0500 Subject: [PATCH 26/44] quote expression --- eng/common/build.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/common/build.ps1 b/eng/common/build.ps1 index 1c5b0e6d5d57..0d7628b8da70 100644 --- a/eng/common/build.ps1 +++ b/eng/common/build.ps1 @@ -143,7 +143,7 @@ try { } if ($properties.Contains('/p:TargetArchitecture=wasm') -and $runtimeFlavor -eq "CoreCLR") { . $PSScriptRoot\..\..\wasm-tools\emsdk\emsdk_env.ps1 - $Env:LLVM_CMAKE_CONFIG = $PSScriptRoot\..\..\wasm-tools\llvm-11.0.0.src\build\lib\cmake\llvm + $Env:LLVM_CMAKE_CONFIG = "$PSScriptRoot\..\..\wasm-tools\llvm-11.0.0.src\build\lib\cmake\llvm" } $nodeReuse = $false } From eb8e054831b8314bca163e0f13e6b62a3a247de5 Mon Sep 17 00:00:00 2001 From: yowl Date: Thu, 11 Mar 2021 19:36:20 -0500 Subject: [PATCH 27/44] use cmake and msbuild from env --- eng/pipelines/runtimelab/install-llvm.ps1 | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 index ab6f879e0d8e..e2e1bf39f2bb 100644 --- a/eng/pipelines/runtimelab/install-llvm.ps1 +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -1,4 +1,3 @@ - # LLVM is supplied in a gz file which Windows doesn't native understand, so we need gz to unpack it - TODO this is liable to fail randomly when a new version comes out and the version number changes Invoke-WebRequest -Uri https://tukaani.org/xz/xz-5.2.5-windows.zip -OutFile xz.zip Expand-Archive -LiteralPath xz.zip -DestinationPath . @@ -17,12 +16,14 @@ cd llvm-11.0.0.src mkdir build dir cd build + + # TODO Release build -cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. +"$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. 
-msbuild LLVM.sln /t:LLVMCore -msbuild LLVM.sln /t:LLVMBitWriter -#msbuild LLVM.sln /t:LLVMDebugInfoDwarf +$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMCore +$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMBitWriter +#$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMDebugInfoDwarf dir From 94da0db1306222f762c00d46a71a5ee5082cf7f3 Mon Sep 17 00:00:00 2001 From: yowl Date: Thu, 11 Mar 2021 20:22:01 -0500 Subject: [PATCH 28/44] quote ps arg --- eng/pipelines/runtimelab/install-llvm.ps1 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 index e2e1bf39f2bb..91224376b603 100644 --- a/eng/pipelines/runtimelab/install-llvm.ps1 +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -7,7 +7,7 @@ Invoke-WebRequest -Uri https://github.com/llvm/llvm-project/releases/download/ll dir -./xz -d llvm-11.0.0.src.tar.xz +./xz -d -Force llvm-11.0.0.src.tar.xz tar -xf llvm-11.0.0.src.tar @@ -19,10 +19,10 @@ cd build # TODO Release build -"$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. +& "$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. -$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMCore -$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMBitWriter +& "$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1" LLVM.sln /t:LLVMCore +& "$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1" LLVM.sln /t:LLVMBitWriter #$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMDebugInfoDwarf dir From 1046e0e85b818504c3a7f81fc4fa02e3aff59650 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 07:09:23 -0500 Subject: [PATCH 29/44] correct force arg --- eng/pipelines/runtimelab/install-llvm.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 index 91224376b603..5b0cd3fc3095 100644 --- a/eng/pipelines/runtimelab/install-llvm.ps1 +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -7,7 +7,7 @@ Invoke-WebRequest -Uri https://github.com/llvm/llvm-project/releases/download/ll dir -./xz -d -Force llvm-11.0.0.src.tar.xz +./xz -d --force llvm-11.0.0.src.tar.xz tar -xf llvm-11.0.0.src.tar From c6df28ebea35913b07b64cb09e56e2e9b78e2483 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 07:40:02 -0500 Subject: [PATCH 30/44] debug the env --- eng/pipelines/runtimelab/install-llvm.cmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index 762290bf9599..718a0929395e 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -1,6 +1,8 @@ mkdir "%1" 2>nul cd /D "%1" +set + powershell -NoProfile -NoLogo -ExecutionPolicy ByPass -command "& """%~dp0install-llvm.ps1""" %*" if %errorlevel% NEQ 0 goto fail From 03393e955938037072c39934ba3d784de32e3ee4 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 08:54:10 -0500 Subject: [PATCH 31/44] set cmakepath --- eng/pipelines/runtimelab/install-llvm.cmd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index 718a0929395e..c86145528b95 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -1,6 +1,11 @@ 
mkdir "%1" 2>nul cd /D "%1" +:: Set CMakePath by evaluating the output from set-cmake-path.ps1 +call "%RepoRoot%src\coreclr\setup_vs_tools.cmd" || exit /b 1 +for /f "delims=" %%a in ('powershell -NoProfile -ExecutionPolicy ByPass "& ""%RepoRoot%eng\native\set-cmake-path.ps1"""') do %%a +echo Using CMake at "%CMakePath%" + set powershell -NoProfile -NoLogo -ExecutionPolicy ByPass -command "& """%~dp0install-llvm.ps1""" %*" From 6e47b0d692b54803efb8bdd4a9e6b1a9bc94bbf5 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 11:25:24 -0500 Subject: [PATCH 32/44] set repoRroot from $(Build.SourcesDirectory) --- eng/pipelines/common/global-build-job.yml | 2 +- eng/pipelines/runtimelab/install-llvm.cmd | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/eng/pipelines/common/global-build-job.yml b/eng/pipelines/common/global-build-job.yml index e4f749f4754e..fe6d51b57e23 100644 --- a/eng/pipelines/common/global-build-job.yml +++ b/eng/pipelines/common/global-build-job.yml @@ -125,7 +125,7 @@ jobs: # Install Wasm dependencies: emscripten, LLVM - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-emscripten.cmd $(Build.SourcesDirectory)\wasm-tools displayName: Install/activate emscripten - - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-llvm.cmd $(Build.SourcesDirectory)\wasm-tools + - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-llvm.cmd $(Build.SourcesDirectory)\wasm-tools $(Build.SourcesDirectory) displayName: Install/build LLVM # Build diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index c86145528b95..7c71c325ecb6 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -1,6 +1,9 @@ mkdir "%1" 2>nul cd /D "%1" +set RepoRoot="%2" + +set :: Set CMakePath by evaluating the output from set-cmake-path.ps1 call "%RepoRoot%src\coreclr\setup_vs_tools.cmd" || exit /b 1 for /f "delims=" %%a in ('powershell -NoProfile -ExecutionPolicy ByPass "& ""%RepoRoot%eng\native\set-cmake-path.ps1"""') do %%a From f8931057a62807190e7a0972a7bafc9022cbac0f Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 12:52:50 -0500 Subject: [PATCH 33/44] add trailing \ --- eng/pipelines/runtimelab/install-llvm.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index 7c71c325ecb6..fec0c08b4258 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -1,7 +1,7 @@ mkdir "%1" 2>nul cd /D "%1" -set RepoRoot="%2" +set RepoRoot="%2\" set :: Set CMakePath by evaluating the output from set-cmake-path.ps1 From e24ac19a79b904969db8d24b8f1eab241fbc9422 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 13:51:10 -0500 Subject: [PATCH 34/44] remove quotes --- eng/pipelines/runtimelab/install-llvm.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index fec0c08b4258..5bcff54beea9 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -1,7 +1,7 @@ mkdir "%1" 2>nul cd /D "%1" -set RepoRoot="%2\" +set RepoRoot=%2\ set :: Set CMakePath by evaluating the output from set-cmake-path.ps1 From 003c0251b9e5b238a166785ad588e43392e97730 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 12 Mar 2021 15:07:50 -0500 Subject: [PATCH 35/44] build using 
cmake not msbuild --- eng/pipelines/runtimelab/install-llvm.ps1 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 index 5b0cd3fc3095..dab5d50ba4ec 100644 --- a/eng/pipelines/runtimelab/install-llvm.ps1 +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -21,9 +21,9 @@ cd build # TODO Release build & "$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. -& "$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1" LLVM.sln /t:LLVMCore -& "$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1" LLVM.sln /t:LLVMBitWriter -#$env:BUILD_SOURCESDIRECTORY\eng\common\msbuild.ps1 LLVM.sln /t:LLVMDebugInfoDwarf +& "$env:CMakePath" --build . --target LLVMCore +& "$env:CMakePath" --build . --target LLVMBitWriter +#& "$env:CMakePath" --build . --target LLVMDebugInfoDwarf dir From 74a93fa4c962b61bfaf5154d0c0217449eb2311d Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 15 Mar 2021 07:29:12 -0500 Subject: [PATCH 36/44] set up clrjit build for wasm additional exports for llvm clrjit --- eng/pipelines/runtimelab/install-llvm.cmd | 4 ++ eng/pipelines/runtimelab/install-llvm.ps1 | 2 +- src/coreclr/jit/CMakeLists.txt | 18 +++++--- src/coreclr/jit/ClrJit.Llvm.exports | 9 ++++ src/coreclr/runtime.proj | 55 +++++++++++++++++++++++ 5 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 src/coreclr/jit/ClrJit.Llvm.exports diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index 5bcff54beea9..7a815ec29fd1 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -14,6 +14,10 @@ set powershell -NoProfile -NoLogo -ExecutionPolicy ByPass -command "& """%~dp0install-llvm.ps1""" %*" if %errorlevel% NEQ 0 goto fail +echo setting LLVM_CMAKE_CONFIG to %1\llvm-11.0.0.src\build +echo "##vso[task.setvariable variable=LLVM_CMAKE_CONFIG]%1\llvm-11.0.0.src\build" +echo "##vso[task.setvariable variable=BUILD_WASM_JIT]1" + exit /b 0 fail: diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 index dab5d50ba4ec..bfd98b4378a1 100644 --- a/eng/pipelines/runtimelab/install-llvm.ps1 +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -19,7 +19,7 @@ cd build # TODO Release build -& "$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd .. +& "$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd -Thost=x64 .. & "$env:CMakePath" --build . --target LLVMCore & "$env:CMakePath" --build . 
--target LLVMBitWriter diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 0bee1b0b8712..675d3ca48c45 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -78,7 +78,7 @@ function(create_standalone_jit) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) - find_package(LLVM REQUIRED CONFIG PATHS $ENV{LLVM_CMAKE_CONFIG}) + find_package(LLVM REQUIRED CONFIG PATHS $ENV{LLVM_CMAKE_CONFIG}) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) llvm_map_components_to_libnames(llvm_libs core bitwriter) @@ -547,18 +547,22 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) + if (NOT CLR_CMAKE_HOST_UNIX AND $ENV{BUILD_WASM_JIT}) + # LLVM clrjit has an extra export - registerLlvmCallbacks + set(CLRJIT_EXPORTS ${CMAKE_CURRENT_LIST_DIR}/ClrJit.Llvm.exports) + set(JIT_EXPORTS_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClrJit.Llvm.exports.def) + preprocess_file (${CLRJIT_EXPORTS} ${JIT_EXPORTS_FILE}) + set(JIT_DEF_FILE ${JIT_EXPORTS_FILE}) + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) + # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll + #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) + endif (NOT CLR_CMAKE_HOST_UNIX AND $ENV{BUILD_WASM_JIT}) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) endif(CLR_CMAKE_TARGET_UNIX) endif (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) -if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") - create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) - # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll - #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) -endif (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") - if (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_PGO_INSTRUMENT) # Copy PGO dependency to target dir set(PGORT_DLL "pgort140.dll") diff --git a/src/coreclr/jit/ClrJit.Llvm.exports b/src/coreclr/jit/ClrJit.Llvm.exports new file mode 100644 index 000000000000..3adee7c3d36a --- /dev/null +++ b/src/coreclr/jit/ClrJit.Llvm.exports @@ -0,0 +1,9 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. 
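+; registerLlvmCallbacks is the one export beyond the usual jit surface (getJit/jitStartup/jitShutdown);
+; ILCompiler.LLVM is expected to call it before compilation so jit/llvm.cpp can capture the output file
+; name, target triple, data layout and the managed name-mangling callback.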
+ +EXPORTS + getJit + jitStartup + jitShutdown + registerLlvmCallbacks + diff --git a/src/coreclr/runtime.proj b/src/coreclr/runtime.proj index 502090b0c5f8..1661ae0ff5b5 100644 --- a/src/coreclr/runtime.proj +++ b/src/coreclr/runtime.proj @@ -45,4 +45,59 @@ + + + + <_CoreClrBuildArg Remove="@(_CoreClrBuildArg)"/> + + <_CoreClrBuildArg Include="-x64" /> + <_CoreClrBuildArg Condition="!$([MSBuild]::IsOsPlatform(Windows)) and '$(CMakeArgs)' != ''" Include="$(CMakeArgs)" /> + <_CoreClrBuildArg Condition="$([MSBuild]::IsOsPlatform(Windows)) and '$(CMakeArgs)' != ''" Include="-cmakeargs "$(CMakeArgs)"" /> + <_CoreClrBuildArg Include="-$(Configuration.ToLower())" /> + <_CoreClrBuildArg Include="$(Compiler)" /> + <_CoreClrBuildArg Condition="'$(ContinuousIntegrationBuild)' == 'true'" Include="-ci" /> + <_CoreClrBuildArg Condition="'$(CrossBuild)' == 'true'" Include="-cross" /> + <_CoreClrBuildArg Condition="'$(PortableBuild)' != 'true'" Include="-portablebuild=false" /> + <_CoreClrBuildArg Condition="'$(KeepNativeSymbols)' != 'false'" Include="-keepnativesymbols" /> + <_CoreClrBuildArg Condition="!$([MSBuild]::IsOsPlatform(Windows))" Include="-os $(TargetOS)" /> + + <_CoreClrBuildArg Condition="$([MSBuild]::IsOsPlatform(Windows)) and + ('$(TargetArchitecture)' == 'x86' or '$(TargetArchitecture)' == 'x64') and + '$(Configuration)' == 'Release' and + '$(ClrRuntimeSubset)' == 'true' and + '$(NoPgoOptimize)' != 'true' and + '$(PgoInstrument)' != 'true'" + Include="-enforcepgo" /> + <_CoreClrBuildArg Condition="$([MSBuild]::IsOsPlatform(Windows)) and '$(CrossDac)' != ''" Include="-$(CrossDac)dac" /> + <_CoreClrBuildArg Condition="'$(Ninja)' == 'true'" Include="-ninja" /> + <_CoreClrBuildArg Condition="'$(ClrRuntimeSubset)' != 'true'" Include="-skipruntime" /> + <_CoreClrBuildArg Condition="'$(ClrJitSubset)' != 'true'" Include="-skipjit" /> + <_CoreClrBuildArg Condition="'$(ClrPalTestsSubset)' == 'true'" Include="-paltests" /> + <_CoreClrBuildArg Condition="'$(ClrAllJitsSubset)' != 'true'" Include="-skipalljits" /> + <_CoreClrBuildArg Condition="'$(PgoInstrument)' == 'true'" Include="-pgoinstrument" /> + <_CoreClrBuildArg Condition="'$(NoPgoOptimize)' == 'true' or '$(PgoInstrument)' == 'true'" Include="-nopgooptimize" /> + <_CoreClrBuildArg Condition="'$(OfficialBuildId)' != ''" Include="/p:OfficialBuildId=$(OfficialBuildId)" /> + + + + <_CoreClrBuildScript Condition="$([MSBuild]::IsOsPlatform(Windows))">build-runtime.cmd + <_CoreClrBuildScript Condition="!$([MSBuild]::IsOsPlatform(Windows))">build-runtime.sh + + + + + + + + + + + + From e18bb91c290c49954f682c04a0cd919d163a2bca Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 15 Mar 2021 09:12:21 -0500 Subject: [PATCH 37/44] fix cmake for undefined env var --- src/coreclr/jit/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 675d3ca48c45..0e6aed125cc8 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -547,7 +547,7 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) - if (NOT CLR_CMAKE_HOST_UNIX AND $ENV{BUILD_WASM_JIT}) + if (NOT CLR_CMAKE_HOST_UNIX AND DEFINED(ENV{BUILD_WASM_JIT})) # LLVM clrjit has an extra export - registerLlvmCallbacks set(CLRJIT_EXPORTS 
${CMAKE_CURRENT_LIST_DIR}/ClrJit.Llvm.exports) set(JIT_EXPORTS_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClrJit.Llvm.exports.def) @@ -556,7 +556,7 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) - endif (NOT CLR_CMAKE_HOST_UNIX AND $ENV{BUILD_WASM_JIT}) + endif (NOT CLR_CMAKE_HOST_UNIX AND DEFINED(ENV{BUILD_WASM_JIT})) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) From e6dd70dfe408881d9f0871763961aa397096cd04 Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 15 Mar 2021 14:09:57 -0500 Subject: [PATCH 38/44] remove env var --- eng/pipelines/runtimelab/install-llvm.cmd | 1 - src/coreclr/jit/CMakeLists.txt | 4 ++-- src/coreclr/runtime.proj | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd index 7a815ec29fd1..aa0d3c299dcc 100644 --- a/eng/pipelines/runtimelab/install-llvm.cmd +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -16,7 +16,6 @@ if %errorlevel% NEQ 0 goto fail echo setting LLVM_CMAKE_CONFIG to %1\llvm-11.0.0.src\build echo "##vso[task.setvariable variable=LLVM_CMAKE_CONFIG]%1\llvm-11.0.0.src\build" -echo "##vso[task.setvariable variable=BUILD_WASM_JIT]1" exit /b 0 diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 0e6aed125cc8..bb56c21b8985 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -547,7 +547,7 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) - if (NOT CLR_CMAKE_HOST_UNIX AND DEFINED(ENV{BUILD_WASM_JIT})) + if (NOT CLR_CMAKE_HOST_UNIX AND ${BUILD_WASM_JIT}) # LLVM clrjit has an extra export - registerLlvmCallbacks set(CLRJIT_EXPORTS ${CMAKE_CURRENT_LIST_DIR}/ClrJit.Llvm.exports) set(JIT_EXPORTS_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClrJit.Llvm.exports.def) @@ -556,7 +556,7 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) - endif (NOT CLR_CMAKE_HOST_UNIX AND DEFINED(ENV{BUILD_WASM_JIT})) + endif (NOT CLR_CMAKE_HOST_UNIX AND ${BUILD_WASM_JIT}) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) diff --git a/src/coreclr/runtime.proj b/src/coreclr/runtime.proj index 1661ae0ff5b5..07b6955cf82b 100644 --- a/src/coreclr/runtime.proj +++ b/src/coreclr/runtime.proj @@ -48,6 +48,10 @@ + + $(CMakeArgs) -D BUILD_WASM_JIT=1 + -DBUILD_WASM_JIT=1 + <_CoreClrBuildArg Remove="@(_CoreClrBuildArg)"/> From efbf3ffbd1c0206423b5480a0850b95b433c3c8a Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 15 Mar 2021 14:10:24 -0500 Subject: [PATCH 39/44] ws --- src/coreclr/runtime.proj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/src/coreclr/runtime.proj b/src/coreclr/runtime.proj index 07b6955cf82b..117fd681fa65 100644 --- a/src/coreclr/runtime.proj +++ b/src/coreclr/runtime.proj @@ -49,7 +49,7 @@ - $(CMakeArgs) -D BUILD_WASM_JIT=1 + $(CMakeArgs) -DBUILD_WASM_JIT=1 -DBUILD_WASM_JIT=1 From 627c5b55193d74586652aa8c777e88c941c8e52b Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 15 Mar 2021 14:59:30 -0500 Subject: [PATCH 40/44] fix for undefined --- src/coreclr/jit/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index bb56c21b8985..cea43de30e58 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -547,7 +547,7 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) - if (NOT CLR_CMAKE_HOST_UNIX AND ${BUILD_WASM_JIT}) + if (NOT CLR_CMAKE_HOST_UNIX AND BUILD_WASM_JIT) # LLVM clrjit has an extra export - registerLlvmCallbacks set(CLRJIT_EXPORTS ${CMAKE_CURRENT_LIST_DIR}/ClrJit.Llvm.exports) set(JIT_EXPORTS_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClrJit.Llvm.exports.def) @@ -556,7 +556,7 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) - endif (NOT CLR_CMAKE_HOST_UNIX AND ${BUILD_WASM_JIT}) + endif (NOT CLR_CMAKE_HOST_UNIX AND BUILD_WASM_JIT) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) From 33a1b1a61ac5da932af5dbcacefe232ff1c8a760 Mon Sep 17 00:00:00 2001 From: yowl Date: Mon, 15 Mar 2021 17:20:02 -0500 Subject: [PATCH 41/44] Enable CI be adding step to download and build LLVM (also need xv as that is the compression Llvm use) Add build of clrjit for Windows x64 Debug only. This means that WebAssembly can only be built on that combination (as is the tested case currently). --- docs/workflow/building/coreclr/nativeaot.md | 2 +- eng/common/build.ps1 | 1 + eng/pipelines/common/global-build-job.yml | 4 +- eng/pipelines/runtimelab/install-llvm.cmd | 24 +++++++++ eng/pipelines/runtimelab/install-llvm.ps1 | 31 +++++++++++ src/coreclr/jit/CMakeLists.txt | 19 ++++--- src/coreclr/jit/ClrJit.Llvm.exports | 9 ++++ src/coreclr/runtime.proj | 59 +++++++++++++++++++++ 8 files changed, 139 insertions(+), 10 deletions(-) create mode 100644 eng/pipelines/runtimelab/install-llvm.cmd create mode 100644 eng/pipelines/runtimelab/install-llvm.ps1 create mode 100644 src/coreclr/jit/ClrJit.Llvm.exports diff --git a/docs/workflow/building/coreclr/nativeaot.md b/docs/workflow/building/coreclr/nativeaot.md index 455b48fa416f..5c415175981b 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -18,7 +18,7 @@ The Native AOT toolchain can be currently built for Linux, macOS and Windows x64 - Extract and create a subdirectory in the llvm-11.0.0.src folder called build. 
cd to this build folder - Configure the LLVM source to use the same runtime as clrjit `cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -D LLVM_USE_CRT_DEBUG=MTd ..` - Build LLVM either from the command line (`build`) or from VS 2019. You only really need to build the LLVMCore and LLVMBitWriter projects which takes less time than the 400 odd projects when building all. This will save some time. -- Edit `src/coreclr/jit/CMakeLists.txt` and change `find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm)` to where you have built LLVM +- set an enviroment variable to locate the LLVM config, e.g. `set LLVM_CMAKE_CONFIG=E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm` - Build the x64 libraries and compiler as per the Building section. - Run `build nativeaot+libs+nativeaot.packages -rc [Debug|Release] -lc [Debug|Release] -a wasm -os Browser -runtimeFlavor CoreCLR` - The compiler can now be debugged with the Wasm clrjit. Load the clrjit_browser_wasm32_x64.vcxproj which can be found in artifacts\obj\coreclr\windows.x64.Debug\jit diff --git a/eng/common/build.ps1 b/eng/common/build.ps1 index d9b6b3adc038..0d7628b8da70 100644 --- a/eng/common/build.ps1 +++ b/eng/common/build.ps1 @@ -143,6 +143,7 @@ try { } if ($properties.Contains('/p:TargetArchitecture=wasm') -and $runtimeFlavor -eq "CoreCLR") { . $PSScriptRoot\..\..\wasm-tools\emsdk\emsdk_env.ps1 + $Env:LLVM_CMAKE_CONFIG = "$PSScriptRoot\..\..\wasm-tools\llvm-11.0.0.src\build\lib\cmake\llvm" } $nodeReuse = $false } diff --git a/eng/pipelines/common/global-build-job.yml b/eng/pipelines/common/global-build-job.yml index 0bce85ed875d..fe6d51b57e23 100644 --- a/eng/pipelines/common/global-build-job.yml +++ b/eng/pipelines/common/global-build-job.yml @@ -122,9 +122,11 @@ jobs: displayName: Disk Usage before Build - ${{ if and(eq(parameters.runtimeFlavor, 'coreclr'), and(eq(parameters.osGroup, 'windows'), eq(parameters.platform, 'Browser_wasm'))) }}: - # Install Wasm dependencies: emscripten + # Install Wasm dependencies: emscripten, LLVM - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-emscripten.cmd $(Build.SourcesDirectory)\wasm-tools displayName: Install/activate emscripten + - script: call $(Build.SourcesDirectory)/eng/pipelines/runtimelab/install-llvm.cmd $(Build.SourcesDirectory)\wasm-tools $(Build.SourcesDirectory) + displayName: Install/build LLVM # Build - script: $(Build.SourcesDirectory)$(dir)build$(scriptExt) -ci -arch ${{ parameters.archType }} $(_osParameter) ${{ parameters.buildArgs }} $(_officialBuildParameter) $(_crossBuildPropertyArg) $(_cxx11Parameter) $(_richCodeNavigationParam) diff --git a/eng/pipelines/runtimelab/install-llvm.cmd b/eng/pipelines/runtimelab/install-llvm.cmd new file mode 100644 index 000000000000..aa0d3c299dcc --- /dev/null +++ b/eng/pipelines/runtimelab/install-llvm.cmd @@ -0,0 +1,24 @@ +mkdir "%1" 2>nul +cd /D "%1" + +set RepoRoot=%2\ + +set +:: Set CMakePath by evaluating the output from set-cmake-path.ps1 +call "%RepoRoot%src\coreclr\setup_vs_tools.cmd" || exit /b 1 +for /f "delims=" %%a in ('powershell -NoProfile -ExecutionPolicy ByPass "& ""%RepoRoot%eng\native\set-cmake-path.ps1"""') do %%a +echo Using CMake at "%CMakePath%" + +set + +powershell -NoProfile -NoLogo -ExecutionPolicy ByPass -command "& """%~dp0install-llvm.ps1""" %*" +if %errorlevel% NEQ 0 goto fail + +echo setting LLVM_CMAKE_CONFIG to %1\llvm-11.0.0.src\build +echo "##vso[task.setvariable variable=LLVM_CMAKE_CONFIG]%1\llvm-11.0.0.src\build" + +exit /b 0 + +fail: +echo "Failed to 
install llvm" +exit /b 1 diff --git a/eng/pipelines/runtimelab/install-llvm.ps1 b/eng/pipelines/runtimelab/install-llvm.ps1 new file mode 100644 index 000000000000..bfd98b4378a1 --- /dev/null +++ b/eng/pipelines/runtimelab/install-llvm.ps1 @@ -0,0 +1,31 @@ +# LLVM is supplied in a gz file which Windows doesn't native understand, so we need gz to unpack it - TODO this is liable to fail randomly when a new version comes out and the version number changes +Invoke-WebRequest -Uri https://tukaani.org/xz/xz-5.2.5-windows.zip -OutFile xz.zip +Expand-Archive -LiteralPath xz.zip -DestinationPath . +copy bin_i686\xz.exe . # get it in the path for tar + +Invoke-WebRequest -Uri https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/llvm-11.0.0.src.tar.xz -OutFile llvm-11.0.0.src.tar.xz + +dir + +./xz -d --force llvm-11.0.0.src.tar.xz + +tar -xf llvm-11.0.0.src.tar + + +cd llvm-11.0.0.src +mkdir build +dir +cd build + + +# TODO Release build +& "$env:CMakePath" -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DLLVM_USE_CRT_DEBUG=MTd -Thost=x64 .. + +& "$env:CMakePath" --build . --target LLVMCore +& "$env:CMakePath" --build . --target LLVMBitWriter +#& "$env:CMakePath" --build . --target LLVMDebugInfoDwarf + +dir + + + diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 1a45b95851e7..cea43de30e58 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -78,8 +78,7 @@ function(create_standalone_jit) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE USE_STL) target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE PAL_STDCPP_COMPAT) - # TODO LLVM build location? - find_package(LLVM REQUIRED CONFIG PATHS E:/llvm11/llvm-11.0.0.src/build/lib/cmake/llvm) + find_package(LLVM REQUIRED CONFIG PATHS $ENV{LLVM_CMAKE_CONFIG}) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) llvm_map_components_to_libnames(llvm_libs core bitwriter) @@ -548,18 +547,22 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_unix_arm_${ARCH_HOST_NAME} OS unix ARCH arm) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) + if (NOT CLR_CMAKE_HOST_UNIX AND BUILD_WASM_JIT) + # LLVM clrjit has an extra export - registerLlvmCallbacks + set(CLRJIT_EXPORTS ${CMAKE_CURRENT_LIST_DIR}/ClrJit.Llvm.exports) + set(JIT_EXPORTS_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClrJit.Llvm.exports.def) + preprocess_file (${CLRJIT_EXPORTS} ${JIT_EXPORTS_FILE}) + set(JIT_DEF_FILE ${JIT_EXPORTS_FILE}) + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) + # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll + #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) + endif (NOT CLR_CMAKE_HOST_UNIX AND BUILD_WASM_JIT) else() if (CLR_CMAKE_TARGET_UNIX) create_standalone_jit(TARGET clrjit_unix_${ARCH_TARGET_NAME}_${ARCH_HOST_NAME} OS unix ARCH ${ARCH_TARGET_NAME}) endif(CLR_CMAKE_TARGET_UNIX) endif (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) -if (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") - create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) - # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll - #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH 
wasm64) -endif (TARGETDETAILS_ARCH STREQUAL "wasm64" OR TARGETDETAILS_ARCH STREQUAL "wasm32") - if (CLR_CMAKE_TARGET_WIN32 AND CLR_CMAKE_PGO_INSTRUMENT) # Copy PGO dependency to target dir set(PGORT_DLL "pgort140.dll") diff --git a/src/coreclr/jit/ClrJit.Llvm.exports b/src/coreclr/jit/ClrJit.Llvm.exports new file mode 100644 index 000000000000..3adee7c3d36a --- /dev/null +++ b/src/coreclr/jit/ClrJit.Llvm.exports @@ -0,0 +1,9 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. + +EXPORTS + getJit + jitStartup + jitShutdown + registerLlvmCallbacks + diff --git a/src/coreclr/runtime.proj b/src/coreclr/runtime.proj index 502090b0c5f8..117fd681fa65 100644 --- a/src/coreclr/runtime.proj +++ b/src/coreclr/runtime.proj @@ -45,4 +45,63 @@ + + + + $(CMakeArgs) -DBUILD_WASM_JIT=1 + -DBUILD_WASM_JIT=1 + + + <_CoreClrBuildArg Remove="@(_CoreClrBuildArg)"/> + + <_CoreClrBuildArg Include="-x64" /> + <_CoreClrBuildArg Condition="!$([MSBuild]::IsOsPlatform(Windows)) and '$(CMakeArgs)' != ''" Include="$(CMakeArgs)" /> + <_CoreClrBuildArg Condition="$([MSBuild]::IsOsPlatform(Windows)) and '$(CMakeArgs)' != ''" Include="-cmakeargs "$(CMakeArgs)"" /> + <_CoreClrBuildArg Include="-$(Configuration.ToLower())" /> + <_CoreClrBuildArg Include="$(Compiler)" /> + <_CoreClrBuildArg Condition="'$(ContinuousIntegrationBuild)' == 'true'" Include="-ci" /> + <_CoreClrBuildArg Condition="'$(CrossBuild)' == 'true'" Include="-cross" /> + <_CoreClrBuildArg Condition="'$(PortableBuild)' != 'true'" Include="-portablebuild=false" /> + <_CoreClrBuildArg Condition="'$(KeepNativeSymbols)' != 'false'" Include="-keepnativesymbols" /> + <_CoreClrBuildArg Condition="!$([MSBuild]::IsOsPlatform(Windows))" Include="-os $(TargetOS)" /> + + <_CoreClrBuildArg Condition="$([MSBuild]::IsOsPlatform(Windows)) and + ('$(TargetArchitecture)' == 'x86' or '$(TargetArchitecture)' == 'x64') and + '$(Configuration)' == 'Release' and + '$(ClrRuntimeSubset)' == 'true' and + '$(NoPgoOptimize)' != 'true' and + '$(PgoInstrument)' != 'true'" + Include="-enforcepgo" /> + <_CoreClrBuildArg Condition="$([MSBuild]::IsOsPlatform(Windows)) and '$(CrossDac)' != ''" Include="-$(CrossDac)dac" /> + <_CoreClrBuildArg Condition="'$(Ninja)' == 'true'" Include="-ninja" /> + <_CoreClrBuildArg Condition="'$(ClrRuntimeSubset)' != 'true'" Include="-skipruntime" /> + <_CoreClrBuildArg Condition="'$(ClrJitSubset)' != 'true'" Include="-skipjit" /> + <_CoreClrBuildArg Condition="'$(ClrPalTestsSubset)' == 'true'" Include="-paltests" /> + <_CoreClrBuildArg Condition="'$(ClrAllJitsSubset)' != 'true'" Include="-skipalljits" /> + <_CoreClrBuildArg Condition="'$(PgoInstrument)' == 'true'" Include="-pgoinstrument" /> + <_CoreClrBuildArg Condition="'$(NoPgoOptimize)' == 'true' or '$(PgoInstrument)' == 'true'" Include="-nopgooptimize" /> + <_CoreClrBuildArg Condition="'$(OfficialBuildId)' != ''" Include="/p:OfficialBuildId=$(OfficialBuildId)" /> + + + + <_CoreClrBuildScript Condition="$([MSBuild]::IsOsPlatform(Windows))">build-runtime.cmd + <_CoreClrBuildScript Condition="!$([MSBuild]::IsOsPlatform(Windows))">build-runtime.sh + + + + + + + + + + + + From 9b5c306f8983f90036a0f2a49c7c0d2393260c36 Mon Sep 17 00:00:00 2001 From: yowl Date: Wed, 17 Mar 2021 07:19:30 -0500 Subject: [PATCH 42/44] typo but/put Co-authored-by: Jan Kotas --- docs/workflow/building/coreclr/nativeaot.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/workflow/building/coreclr/nativeaot.md 
b/docs/workflow/building/coreclr/nativeaot.md index d8b4c6b7a714..fb39bd707de7 100644 --- a/docs/workflow/building/coreclr/nativeaot.md +++ b/docs/workflow/building/coreclr/nativeaot.md @@ -29,7 +29,7 @@ The Native AOT toolchain can be currently built for Linux, macOS and Windows x64 - To work on the clr jit for LLVM: - Open the Ilc solution and add the clr jit project `clrjit_browser_wasm32_x64.vcxproj` from `artifacts\obj\coreclr\windows.x64.Debug\jit` - In the project properties General section, change the output folder to the full path for `artifacts\bin\coreclr\windows.x64.Debug\ilc\net5.0` e.g. `E:\GitHub\runtimelab\artifacts\bin\coreclr\windows.x64.Debug\ilc\net5.0` -- Build `clrjit_browser_wasm32_x64` project and you should now be able to change and but breakpoints in the c++ code. +- Build `clrjit_browser_wasm32_x64` project and you should now be able to change and put breakpoints in the c++ code. ## Visual Studio Solutions From 76fb9b5a3ffc78b4164526e4a4f706b4fac3cbe9 Mon Sep 17 00:00:00 2001 From: yowl Date: Fri, 19 Mar 2021 17:41:07 -0500 Subject: [PATCH 43/44] move genType../genActualTypes to compiler.cpp --- src/coreclr/jit/codegencommon.cpp | 26 -------------------------- src/coreclr/jit/compiler.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 24c4c936ba45..2c18ce3fe287 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -30,32 +30,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "patchpointinfo.h" /*****************************************************************************/ -#endif //!TARGET_WASM - -const BYTE genTypeSizes[] = { -#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) sz, -#include "typelist.h" -#undef DEF_TP -}; - -const BYTE genTypeAlignments[] = { -#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) al, -#include "typelist.h" -#undef DEF_TP -}; - -const BYTE genTypeStSzs[] = { -#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) st, -#include "typelist.h" -#undef DEF_TP -}; - -const BYTE genActualTypes[] = { -#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) jitType, -#include "typelist.h" -#undef DEF_TP -}; -#ifndef TARGET_WASM void CodeGenInterface::setFramePointerRequiredEH(bool value) { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index cca45918f4dc..4e6e4b88c3f6 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -130,6 +130,30 @@ inline unsigned getCurTime() return (((tim.wHour * 60) + tim.wMinute) * 60 + tim.wSecond) * 1000 + tim.wMilliseconds; } +const BYTE genTypeSizes[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) sz, +#include "typelist.h" +#undef DEF_TP +}; + +const BYTE genTypeAlignments[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) al, +#include "typelist.h" +#undef DEF_TP +}; + +const BYTE genTypeStSzs[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) st, +#include "typelist.h" +#undef DEF_TP +}; + +const BYTE genActualTypes[] = { +#define DEF_TP(tn, nm, jitType, verType, sz, sze, asze, st, al, tf, howUsed) jitType, +#include "typelist.h" +#undef DEF_TP +}; + /*****************************************************************************/ #ifdef DEBUG 
/*****************************************************************************/ From e59d342a0fd2aa2117da2147b6da8d8d3e3cc64e Mon Sep 17 00:00:00 2001 From: yowl Date: Sun, 21 Mar 2021 07:38:34 -0500 Subject: [PATCH 44/44] address feedback - Remove unnecessary ifdefs - Replace ifdefs with cmake exclusions - Fix passing of corinfo - remove some comments --- src/coreclr/inc/switches.h | 2 +- src/coreclr/jit/CMakeLists.txt | 17 +- src/coreclr/jit/codegencommon.cpp | 3 +- src/coreclr/jit/codegenlinear.cpp | 2 - src/coreclr/jit/compiler.cpp | 9 +- src/coreclr/jit/compiler.h | 6 +- src/coreclr/jit/emit.cpp | 3 +- src/coreclr/jit/gcencode.cpp | 3 +- src/coreclr/jit/importer.cpp | 6 +- src/coreclr/jit/instr.cpp | 19 +- src/coreclr/jit/instr.h | 8 - src/coreclr/jit/llvm.cpp | 26 +- src/coreclr/jit/llvm.h | 3 - src/coreclr/jit/lower.cpp | 2 - src/coreclr/jit/lsra.cpp | 2 - src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsrabuild.cpp | 2 - src/coreclr/jit/morph.cpp | 21 +- src/coreclr/jit/regalloc.cpp | 6 - src/coreclr/jit/stacklevelsetter.cpp | 3 - src/coreclr/jit/typelist.h | 4 - src/coreclr/jit/unwindwasm.cpp | 426 ------------------ .../CodeGen/ILToLLVMImporter.cs | 68 --- .../Compiler/LLVMCodegenCompilation.cs | 10 +- .../Compiler/RyuJitLLVMCompilation.cs | 200 -------- .../ILCompiler.LLVM/ILCompiler.LLVM.csproj | 1 - .../JitInterface/CorInfoImpl.Llvm.cs | 22 +- 27 files changed, 60 insertions(+), 816 deletions(-) delete mode 100644 src/coreclr/jit/unwindwasm.cpp delete mode 100644 src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs diff --git a/src/coreclr/inc/switches.h b/src/coreclr/inc/switches.h index 51a55c0a839a..12e63753ed55 100644 --- a/src/coreclr/inc/switches.h +++ b/src/coreclr/inc/switches.h @@ -66,7 +66,7 @@ #endif // !HOST_UNIX #elif defined(TARGET_WASM) - #define USE_UPPER_ADDRESS 0 // TODO : what's this? 
+ #define USE_UPPER_ADDRESS 0 // not used but is required to be defined #else #error Please add a new #elif clause and define all portability macros for the new platform #endif diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index cea43de30e58..ead9b73e12a0 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -329,7 +329,6 @@ set( JIT_WASM64_SOURCES simdashwintrinsic.cpp simdcodegenxarch.cpp targetwasm.cpp - unwindwasm.cpp hwintrinsicxarch.cpp hwintrinsiccodegenxarch.cpp llvm.cpp @@ -339,7 +338,6 @@ set( JIT_WASM32_SOURCES simdashwintrinsic.cpp simdcodegenxarch.cpp targetwasm.cpp - unwindwasm.cpp hwintrinsicxarch.cpp hwintrinsiccodegenxarch.cpp llvm.cpp @@ -548,11 +546,26 @@ if (CLR_CMAKE_BUILD_SUBSET_ALLJITS AND NOT CLR_CROSS_COMPONENTS_BUILD) create_standalone_jit(TARGET clrjit_win_arm_${ARCH_HOST_NAME} OS win ARCH arm) create_standalone_jit(TARGET clrjit_win_x86_${ARCH_HOST_NAME} OS win ARCH x86) if (NOT CLR_CMAKE_HOST_UNIX AND BUILD_WASM_JIT) + # the LLVM clrjit needs to be the last clrjit to use create_standalone_jit as it modifies some cmake variables # LLVM clrjit has an extra export - registerLlvmCallbacks set(CLRJIT_EXPORTS ${CMAKE_CURRENT_LIST_DIR}/ClrJit.Llvm.exports) set(JIT_EXPORTS_FILE ${CMAKE_CURRENT_BINARY_DIR}/ClrJit.Llvm.exports.def) preprocess_file (${CLRJIT_EXPORTS} ${JIT_EXPORTS_FILE}) set(JIT_DEF_FILE ${JIT_EXPORTS_FILE}) + + # exclude cpp files that are not required when not processing beyond rationalized LIR + # use REGEX as this list conatins the absolute paths + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX stacklevelsetter\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX codegencommon\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX codegenlinear\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX emit\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX gcencode\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX instr\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX lower\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX lsra\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX lsrabuild\.cpp) + list(FILTER JIT_CORE_SOURCES EXCLUDE REGEX regalloc\.cpp) + create_standalone_jit(TARGET clrjit_browser_wasm32_${ARCH_HOST_NAME} OS browser ARCH wasm32) # uncomment to enable 8 byte pointer size version of the wasm clrjit.dll #create_standalone_jit(TARGET clrjit_browser_wasm64_${ARCH_HOST_NAME} OS browser ARCH wasm64) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 2c18ce3fe287..418ccaa9385c 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -13,7 +13,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // TODO-Cleanup: There are additional methods in CodeGen*.cpp that are almost // identical, and which should probably be moved here. 
-#ifndef TARGET_WASM + #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -12997,4 +12997,3 @@ void CodeGenInterface::VariableLiveKeeper::dumpLvaVariableLiveRanges() const } #endif // DEBUG #endif // USING_VARIABLE_LIVE_RANGE -#endif // TARGET_WASM diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index dc86ff503587..215e3c04f75b 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -9,7 +9,6 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -2649,4 +2648,3 @@ void CodeGen::genCodeForSetcc(GenTreeCC* setcc) inst_SETCC(setcc->gtCondition, setcc->TypeGet(), setcc->GetRegNum()); genProduceReg(setcc); } -#endif // TARGET_WASM diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 4e6e4b88c3f6..b281d22fb9d7 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -29,11 +29,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(TARGET_WASM) #include "llvm.h" -#else -// TODO: how to get different exports.def for the different clrjits? -void registerLlvmCallbacks(void* thisPtr, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)) -{ -} #endif #if defined(DEBUG) @@ -4475,7 +4470,6 @@ inline void DoLlvmPhase(Compiler* pCompiler) Llvm* llvm = new Llvm(); llvm->Compile(pCompiler); delete llvm; - //assert(false); } #endif @@ -5180,8 +5174,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl rat.Run(); #if defined(TARGET_WASM) - // TODO:after rat, but better before? - DoLlvmPhase(this); // DoPhase? + DoLlvmPhase(this); #else // Here we do "simple lowering". 
When the RyuJIT backend works for all diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 8236f14401ba..16cbfb93a643 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -5781,7 +5781,9 @@ class Compiler bool fgCheckStmtAfterTailCall(); GenTree* fgMorphTailCallViaHelpers(GenTreeCall* call, CORINFO_TAILCALL_HELPERS& help); bool fgCanTailCallViaJitHelper(); +#ifdef TARGET_X86 void fgMorphTailCallViaJitHelper(GenTreeCall* call); +#endif GenTree* fgCreateCallDispatcherAndGetResult(GenTreeCall* origCall, CORINFO_METHOD_HANDLE callTargetStubHnd, CORINFO_METHOD_HANDLE dispatcherHnd); @@ -11293,14 +11295,10 @@ const instruction INS_SQRT = INS_fsqrt; /*****************************************************************************/ -#ifndef TARGET_WASM extern const BYTE genTypeSizes[]; -#endif //!TARGET_WASM extern const BYTE genTypeAlignments[]; -#ifndef TARGET_WASM extern const BYTE genTypeStSzs[]; extern const BYTE genActualTypes[]; -#endif //!TARGET_WASM /*****************************************************************************/ diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index ff08e44eb1b0..ad044c9dbebc 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -9,7 +9,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM + #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -8586,4 +8586,3 @@ regMaskTP emitter::emitGetGCRegsKilledByNoGCCall(CorInfoHelpFunc helper) return result; } -#endif // TARGET_WASM diff --git a/src/coreclr/jit/gcencode.cpp b/src/coreclr/jit/gcencode.cpp index 9a0c700ddff2..bfe9dbe3ee91 100644 --- a/src/coreclr/jit/gcencode.cpp +++ b/src/coreclr/jit/gcencode.cpp @@ -11,7 +11,7 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM + #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -4832,7 +4832,6 @@ void GCInfo::gcInfoRecordGCStackArgsDead(GcInfoEncoder* gcInfoEncoder, #undef GCENCODER_WITH_LOGGING #endif // !JIT32_GCENCODER -#endif // !TARGET_WASM /*****************************************************************************/ /*****************************************************************************/ diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index 5f627c72bf21..a725b363cb32 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -19774,11 +19774,7 @@ void Compiler::impInlineInitVars(InlineInfo* pInlineInfo) return; } } -#ifndef TARGET_WASM - else if (genTypeSize(sigType) < EA_PTRSIZE) -#else - else if (genTypeSize(sigType) < TARGET_POINTER_SIZE) // TODO: is this sensible in the abscence of EA_PTRSIZE? -#endif + else if (genTypeSize(sigType) < TARGET_POINTER_SIZE) { // Narrowing cast. 
if (inlArgNode->OperIs(GT_LCL_VAR)) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 1524ab3ab1bb..d6e2951fa894 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -11,7 +11,6 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -66,15 +65,6 @@ const char* CodeGen::genInsName(instruction ins) #define INST9(id, nm, ldst, fmt, e1, e2, e3, e4, e5, e6, e7, e8, e9 ) nm, #include "instrs.h" -#elif defined(TARGET_WASM) - #define INST0(id, nm, um, mr, flags) nm, - #define INST1(id, nm, um, mr, flags) nm, - #define INST2(id, nm, um, mr, mi, flags) nm, - #define INST3(id, nm, um, mr, mi, rm, flags) nm, - #define INST4(id, nm, um, mr, mi, rm, a4, flags) nm, - #define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) nm, - #include "instrs.h" - #else #error "Unknown TARGET" #endif @@ -1859,7 +1849,7 @@ instruction CodeGen::ins_Copy(regNumber srcReg, var_types dstType) { return ins_Copy(dstType); } -#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm +#if defined(TARGET_XARCH) return INS_movd; #elif defined(TARGET_ARM64) if (dstIsFloatReg) @@ -2318,7 +2308,7 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) } #endif // DEBUG -#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm +#if defined(TARGET_XARCH) // only full barrier needs to be emitted on Xarch if (barrierKind != BARRIER_FULL) { @@ -2380,7 +2370,7 @@ void CodeGen::instGen_Compare_Reg_To_Zero(emitAttr size, regNumber reg) */ void CodeGen::instGen_Compare_Reg_To_Reg(emitAttr size, regNumber reg1, regNumber reg2) { -#if defined(TARGET_XARCH) || defined(TARGET_ARMARCH) || defined(TARGET_WASM) // TODO Wasm +#if defined(TARGET_XARCH) || defined(TARGET_ARMARCH) GetEmitter()->emitIns_R_R(INS_cmp, size, reg1, reg2); #else #error "Unknown TARGET" @@ -2400,7 +2390,7 @@ void CodeGen::instGen_Compare_Reg_To_Imm(emitAttr size, regNumber reg, target_ss } else { -#if defined(TARGET_XARCH) || defined(TARGET_WASM) // TODO Wasm +#if defined(TARGET_XARCH) #if defined(TARGET_AMD64) if ((EA_SIZE(size) == EA_8BYTE) && (((int)imm != (ssize_t)imm) || EA_IS_CNS_RELOC(size))) { @@ -2456,7 +2446,6 @@ void CodeGen::instGen_Store_Reg_Into_Lcl(var_types dstType, regNumber srcReg, in GetEmitter()->emitIns_S_R(ins_Store(dstType), size, srcReg, varNum, offs); } -#endif // !TARGET_WASM /*****************************************************************************/ /*****************************************************************************/ diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index b6ebbd78a86d..f8e31e6bcc40 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -48,14 +48,6 @@ enum instruction : unsigned INS_lea, // Not a real instruction. 
 It is used for load the address of stack locals
-#elif defined(TARGET_WASM)
-#define INST0(id, nm, um, mr, flags) INS_##id,
-#define INST1(id, nm, um, mr, flags) INS_##id,
-#define INST2(id, nm, um, mr, mi, flags) INS_##id,
-#define INST3(id, nm, um, mr, mi, rm, flags) INS_##id,
-#define INST4(id, nm, um, mr, mi, rm, a4, flags) INS_##id,
-#define INST5(id, nm, um, mr, mi, rm, a4, rr, flags) INS_##id,
-#include "instrs.h"
 #else
 #error Unsupported target architecture
 #endif
diff --git a/src/coreclr/jit/llvm.cpp b/src/coreclr/jit/llvm.cpp
index 779a39fddb23..e85f153a6339 100644
--- a/src/coreclr/jit/llvm.cpp
+++ b/src/coreclr/jit/llvm.cpp
@@ -21,9 +21,9 @@ using llvm::LLVMContext;
 using llvm::ArrayRef;
 using llvm::Module;
 
-static Module* _module;
+static Module* _module = nullptr;
 static LLVMContext _llvmContext;
-static void* _thisPtr;
+static void* _thisPtr; // TODO: workaround for not changing the JIT/EE interface. As this is static, it will probably fail if multithreaded compilation is attempted
 static const char* (*_getMangledMethodName)(void*, CORINFO_METHOD_STRUCT_*);
 static char* _outputFileName;
 static Function* _doNothingFunction;
@@ -34,16 +34,17 @@ extern "C" DLLEXPORT void registerLlvmCallbacks(void* thisPtr, const char* outpu
 {
     _thisPtr = thisPtr;
     _getMangledMethodName = getMangledMethodNamePtr;
-    _module = new Module(llvm::StringRef("netscripten-clrjit"), _llvmContext);
-    _module->setTargetTriple(triple);
-    _module->setDataLayout(dataLayout);
-
-// _outputFileName = getAllocator(CMK_DebugOnly).allocate(strlen(outputFileName) + 1)
-    _outputFileName = (char*)malloc(strlen(outputFileName) + 7);
-    strcpy(_outputFileName, "1.txt"); // ??? without this _outputFileName is corrupted
-    strcpy(_outputFileName, outputFileName);
-    strcpy(_outputFileName + strlen(_outputFileName) - 3, "clrjit"); // use different module output name for now, TODO: delete if old LLVM gen does not create a module
-    strcat(_outputFileName, ".bc");
+    if (_module == nullptr) // registerLlvmCallbacks is called for each method to compile, but must only create the module once. Better perhaps to split this into 2 calls.
+    {
+        _module = new Module(llvm::StringRef("netscripten-clrjit"), _llvmContext);
+        _module->setTargetTriple(triple);
+        _module->setDataLayout(dataLayout);
+        _outputFileName = (char*)malloc(strlen(outputFileName) + 7);
+        strcpy(_outputFileName, "1.txt"); // ??? without this _outputFileName is corrupted
+        strcpy(_outputFileName, outputFileName);
+        strcpy(_outputFileName + strlen(_outputFileName) - 3, "clrjit"); // use different module output name for now, TODO: delete if old LLVM gen does not create a module
+        strcat(_outputFileName, ".bc");
+    }
 }
 
 void Llvm::Init()
@@ -63,7 +64,6 @@ void Llvm::llvmShutdown()
 #endif //DEBUG
     llvm::raw_fd_ostream OS(_outputFileName, ec);
     llvm::WriteBitcodeToFile(*_module, OS);
-    //_module->end();
     delete _module;
 // Module.Verify(LLVMVerifierFailureAction.LLVMAbortProcessAction);
 }
diff --git a/src/coreclr/jit/llvm.h b/src/coreclr/jit/llvm.h
index d38dd25df818..907273192130 100644
--- a/src/coreclr/jit/llvm.h
+++ b/src/coreclr/jit/llvm.h
@@ -20,9 +20,6 @@
 #define IMAGE_FILE_MACHINE_WASM32 0xFFFF
 #define IMAGE_FILE_MACHINE_WASM64 0xFFFE // TODO: appropriate values for this?
Used to check compilation is for intended target - - - extern "C" void registerLlvmCallbacks(void* thisPtr, const char* outputFileName, const char* triple, const char* dataLayout, const char* (*getMangledMethodNamePtr)(void*, CORINFO_METHOD_STRUCT_*)); class Llvm diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2d8ca1a05f84..4a5468cb3c58 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -14,7 +14,6 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -6693,4 +6692,3 @@ bool Lowering::TryTransformStoreObjAsStoreInd(GenTreeBlk* blkNode) LowerStoreIndirCommon(blkNode); return true; } -#endif // !TARGET_WASM diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index aa1da5249426..996dcac0dd9f 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -89,7 +89,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -11297,4 +11296,3 @@ void LinearScan::verifyResolutionMove(GenTree* resolutionMove, LsraLocation curr } } #endif // DEBUG -#endif // !TARGET_WASM diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index ac5318764c0e..7df658560985 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1764,7 +1764,7 @@ class LinearScan : public LinearScanInterface int BuildGCWriteBarrier(GenTree* tree); int BuildCast(GenTreeCast* cast); -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) // returns true if the tree can use the read-modify-write memory instruction form bool isRMWRegOper(GenTree* tree); int BuildMul(GenTree* tree); diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 3899abd600ac..06b6799c1824 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -13,7 +13,6 @@ XX XX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#ifndef TARGET_WASM #include "jitpch.h" #ifdef _MSC_VER @@ -3906,4 +3905,3 @@ int LinearScan::BuildCmp(GenTree* tree) } return srcCount; } -#endif // !TARGET_WASM diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index a75c392a8d3a..28591171a776 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -7500,11 +7500,13 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call) // On x86 we have a faster mechanism than the general one which we use // in almost all cases. See fgCanTailCallViaJitHelper for more information. +#ifdef TARGET_X86 if (fgCanTailCallViaJitHelper()) { tailCallViaJitHelper = true; } - else +#endif + if (!tailCallViaJitHelper) { // Make sure we can get the helpers. We do this last as the runtime // will likely be required to generate these. @@ -7744,6 +7746,7 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call) // Do some target-specific transformations (before we process the args, // etc.) for the JIT helper case. 
+#ifdef TARGET_X86 if (tailCallViaJitHelper) { fgMorphTailCallViaJitHelper(call); @@ -7752,7 +7755,7 @@ GenTree* Compiler::fgMorphPotentialTailCall(GenTreeCall* call) // argument list, invalidating the argInfo. call->fgArgInfo = nullptr; } - +#endif // Tail call via JIT helper: The VM can't use return address hijacking // if we're not going to return and the helper doesn't have enough info // to safely poll, so we poll before the tail call, if the block isn't @@ -8460,6 +8463,7 @@ GenTree* Compiler::getTokenHandleTree(CORINFO_RESOLVED_TOKEN* pResolvedToken, bo return result; } +#ifdef TARGET_X86 /***************************************************************************** * * Transform the given GT_CALL tree for tail call via JIT helper. @@ -8624,13 +8628,8 @@ void Compiler::fgMorphTailCallViaJitHelper(GenTreeCall* call) assert(ppArg != nullptr); assert(*ppArg == nullptr); -#ifndef TARGET_WASM unsigned nOldStkArgsWords = (compArgSize - (codeGen->intRegState.rsCalleeRegArgCount * REGSIZE_BYTES)) / REGSIZE_BYTES; -#else - unsigned nOldStkArgsWords = 0; - assert(false); // TODO: Wasm: what to do here? -#endif // !TARGET_WASM GenTree* arg3 = gtNewIconNode((ssize_t)nOldStkArgsWords, TYP_I_IMPL); *ppArg = gtNewCallArgs(arg3); // numberOfOldStackArgs ppArg = &((*ppArg)->NextRef()); @@ -8662,6 +8661,7 @@ void Compiler::fgMorphTailCallViaJitHelper(GenTreeCall* call) JITDUMP("fgMorphTailCallViaJitHelper (after):\n"); DISPTREE(call); } +#endif //TARGET_X86 //------------------------------------------------------------------------ // fgGetStubAddrArg: Return the virtual stub address for the given call. @@ -19120,6 +19120,7 @@ bool Compiler::fgCheckStmtAfterTailCall() return nextMorphStmt == nullptr; } +#ifdef TARGET_X86 //------------------------------------------------------------------------ // fgCanTailCallViaJitHelper: check whether we can use the faster tailcall // JIT helper on x86. @@ -19129,17 +19130,13 @@ bool Compiler::fgCheckStmtAfterTailCall() // bool Compiler::fgCanTailCallViaJitHelper() { -#ifndef TARGET_X86 - // On anything except X86 we have no faster mechanism available. - return false; -#else // The JIT helper does not properly handle the case where localloc was used. if (compLocallocUsed) return false; return true; -#endif } +#endif static const int numberOfTrackedFlags = 5; static const unsigned trackedFlags[numberOfTrackedFlags] = {GTF_ASG, GTF_CALL, GTF_EXCEPT, GTF_GLOB_REF, diff --git a/src/coreclr/jit/regalloc.cpp b/src/coreclr/jit/regalloc.cpp index 258b2b0cb2f6..5e609b66d519 100644 --- a/src/coreclr/jit/regalloc.cpp +++ b/src/coreclr/jit/regalloc.cpp @@ -109,7 +109,6 @@ bool Compiler::shouldDoubleAlign(unsigned refCntStk, } #endif // DOUBLE_ALIGN -#ifndef TARGET_WASM // The code to set the regState for each arg is outlined for shared use // by linear scan. (It is not shared for System V AMD64 platform.) regNumber Compiler::raUpdateRegStateForArg(RegState* regState, LclVarDsc* argDsc) @@ -195,7 +194,6 @@ regNumber Compiler::raUpdateRegStateForArg(RegState* regState, LclVarDsc* argDsc return inArgReg; } -#endif // !TARGET_WASM /****************************************************************************/ /* Returns true when we must create an EBP frame @@ -387,11 +385,7 @@ void Compiler::raMarkStkVars() // stack frame NOT_STK:; -#ifndef TARGET_WASM varDsc->lvFramePointerBased = codeGen->isFramePointerUsed(); -#else - varDsc->lvFramePointerBased = false; // TODO Wasm sensible default? 
-#endif #if DOUBLE_ALIGN diff --git a/src/coreclr/jit/stacklevelsetter.cpp b/src/coreclr/jit/stacklevelsetter.cpp index 1746cf492be6..361a4faadf53 100644 --- a/src/coreclr/jit/stacklevelsetter.cpp +++ b/src/coreclr/jit/stacklevelsetter.cpp @@ -1,8 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#ifndef TARGET_WASM - #include "jitpch.h" #ifdef _MSC_VER #pragma hdrstop @@ -359,4 +357,3 @@ void StackLevelSetter::CheckAdditionalArgs() } #endif // TARGET_X86 } -#endif // !TARGET_WASM diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index b61a630a85f1..21032081d5f2 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -3,11 +3,7 @@ #define GCS EA_GCREF #define BRS EA_BYREF -#if defined(TARGET_WASM) #define PS TARGET_POINTER_SIZE -#else -#define PS EA_PTRSIZE -#endif #define PST (TARGET_POINTER_SIZE / sizeof(int)) #ifdef TARGET_64BIT diff --git a/src/coreclr/jit/unwindwasm.cpp b/src/coreclr/jit/unwindwasm.cpp deleted file mode 100644 index 4306917bc131..000000000000 --- a/src/coreclr/jit/unwindwasm.cpp +++ /dev/null @@ -1,426 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XX XX -XX UnwindInfo XX -XX XX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX -*/ - -#include "jitpch.h" -#ifdef _MSC_VER -#pragma hdrstop -#endif - -#if defined(TARGET_WASM) -typedef union _UNWIND_CODE { - struct { - UCHAR CodeOffset; - UCHAR UnwindOp : 4; - UCHAR OpInfo : 4; - }; - - struct { - UCHAR OffsetLow; - UCHAR UnwindOp : 4; - UCHAR OffsetHigh : 4; - } EpilogueCode; - - USHORT FrameOffset; -} UNWIND_CODE, * PUNWIND_CODE; -typedef struct _UNWIND_INFO { - UCHAR Version : 3; - UCHAR Flags : 5; - UCHAR SizeOfProlog; - UCHAR CountOfUnwindCodes; - UCHAR FrameRegister : 4; - UCHAR FrameOffset : 4; - UNWIND_CODE UnwindCode[1]; - - // - // The unwind codes are followed by an optional DWORD aligned field that - // contains the exception handler address or the address of chained unwind - // information. If an exception handler address is specified, then it is - // followed by the language specified exception handler data. 
- // - // union { - // ULONG ExceptionHandler; - // ULONG FunctionEntry; - // }; - // - // ULONG ExceptionData[]; - // - -} UNWIND_INFO, * PUNWIND_INFO; - -#ifdef UNIX_AMD64_ABI -short Compiler::mapRegNumToDwarfReg(regNumber reg) -{ - short dwarfReg = DWARF_REG_ILLEGAL; - - switch (reg) - { - case REG_RAX: - dwarfReg = 0; - break; - case REG_RCX: - dwarfReg = 2; - break; - case REG_RDX: - dwarfReg = 1; - break; - case REG_RBX: - dwarfReg = 3; - break; - case REG_RSP: - dwarfReg = 7; - break; - case REG_RBP: - dwarfReg = 6; - break; - case REG_RSI: - dwarfReg = 4; - break; - case REG_RDI: - dwarfReg = 5; - break; - case REG_R8: - dwarfReg = 8; - break; - case REG_R9: - dwarfReg = 9; - break; - case REG_R10: - dwarfReg = 10; - break; - case REG_R11: - dwarfReg = 11; - break; - case REG_R12: - dwarfReg = 12; - break; - case REG_R13: - dwarfReg = 13; - break; - case REG_R14: - dwarfReg = 14; - break; - case REG_R15: - dwarfReg = 15; - break; - case REG_XMM0: - dwarfReg = 17; - break; - case REG_XMM1: - dwarfReg = 18; - break; - case REG_XMM2: - dwarfReg = 19; - break; - case REG_XMM3: - dwarfReg = 20; - break; - case REG_XMM4: - dwarfReg = 21; - break; - case REG_XMM5: - dwarfReg = 22; - break; - case REG_XMM6: - dwarfReg = 23; - break; - case REG_XMM7: - dwarfReg = 24; - break; - case REG_XMM8: - dwarfReg = 25; - break; - case REG_XMM9: - dwarfReg = 26; - break; - case REG_XMM10: - dwarfReg = 27; - break; - case REG_XMM11: - dwarfReg = 28; - break; - case REG_XMM12: - dwarfReg = 29; - break; - case REG_XMM13: - dwarfReg = 30; - break; - case REG_XMM14: - dwarfReg = 31; - break; - case REG_XMM15: - dwarfReg = 32; - break; - default: - noway_assert(!"unexpected REG_NUM"); - } - - return dwarfReg; -} - -#endif // UNIX_AMD64_ABI - -//------------------------------------------------------------------------ -// Compiler::unwindBegProlog: Initialize the unwind info data structures. -// Called at the beginning of main function or funclet prolog generation. -// -void Compiler::unwindBegProlog() -{ -#ifdef UNIX_AMD64_ABI - if (generateCFIUnwindCodes()) - { - unwindBegPrologCFI(); - } - else -#endif // UNIX_AMD64_ABI - { - unwindBegPrologWindows(); - } -} - -void Compiler::unwindBegPrologWindows() -{ - assert(false); -} - -//------------------------------------------------------------------------ -// Compiler::unwindEndProlog: Called at the end of main function or funclet -// prolog generation to indicate there is no more unwind information for this prolog. -// -void Compiler::unwindEndProlog() -{ - assert(compGeneratingProlog); -} - -//------------------------------------------------------------------------ -// Compiler::unwindBegEpilog: Called at the beginning of main function or funclet -// epilog generation. -// -void Compiler::unwindBegEpilog() -{ - assert(compGeneratingEpilog); -} - -//------------------------------------------------------------------------ -// Compiler::unwindEndEpilog: Called at the end of main function or funclet -// epilog generation. -// -void Compiler::unwindEndEpilog() -{ - assert(compGeneratingEpilog); -} - -//------------------------------------------------------------------------ -// Compiler::unwindPush: Record a push/save of a register. -// -// Arguments: -// reg - The register being pushed/saved. 
-// -void Compiler::unwindPush(regNumber reg) -{ -#ifdef UNIX_AMD64_ABI - if (generateCFIUnwindCodes()) - { - unwindPushPopCFI(reg); - } - else -#endif // UNIX_AMD64_ABI - { - unwindPushWindows(reg); - } -} - -void Compiler::unwindPushWindows(regNumber reg) -{ - assert(false); -} - -#ifdef UNIX_AMD64_ABI -#endif // UNIX_AMD64_ABI - -//------------------------------------------------------------------------ -// Compiler::unwindAllocStack: Record a stack frame allocation (sub sp, X). -// -// Arguments: -// size - The size of the stack frame allocation (the amount subtracted from the stack pointer). -// -void Compiler::unwindAllocStack(unsigned size) -{ -#ifdef UNIX_AMD64_ABI - if (generateCFIUnwindCodes()) - { - unwindAllocStackCFI(size); - } - else -#endif // UNIX_AMD64_ABI - { - unwindAllocStackWindows(size); - } -} - -void Compiler::unwindAllocStackWindows(unsigned size) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// Compiler::unwindSetFrameReg: Record a frame register. -// -// Arguments: -// reg - The register being set as the frame register. -// offset - The offset from the current stack pointer that the frame pointer will point at. -// -void Compiler::unwindSetFrameReg(regNumber reg, unsigned offset) -{ -#ifdef UNIX_AMD64_ABI - if (generateCFIUnwindCodes()) - { - unwindSetFrameRegCFI(reg, offset); - } - else -#endif // UNIX_AMD64_ABI - { - unwindSetFrameRegWindows(reg, offset); - } -} - -void Compiler::unwindSetFrameRegWindows(regNumber reg, unsigned offset) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// Compiler::unwindSaveReg: Record a register save. -// -// Arguments: -// reg - The register being saved. -// offset - The offset from the current stack pointer where the register is being saved. -// -void Compiler::unwindSaveReg(regNumber reg, unsigned offset) -{ -#ifdef UNIX_AMD64_ABI - if (generateCFIUnwindCodes()) - { - unwindSaveRegCFI(reg, offset); - } - else -#endif // UNIX_AMD64_ABI - { - unwindSaveRegWindows(reg, offset); - } -} - -void Compiler::unwindSaveRegWindows(regNumber reg, unsigned offset) -{ - assert(false); -} - -#ifdef UNIX_AMD64_ABI -void Compiler::unwindSaveRegCFI(regNumber reg, unsigned offset) -{ - assert(compGeneratingProlog); - - if (RBM_CALLEE_SAVED & genRegMask(reg)) - { - FuncInfoDsc* func = funCurrentFunc(); - - unsigned int cbProlog = unwindGetCurrentOffset(func); - createCfiCode(func, cbProlog, CFI_REL_OFFSET, mapRegNumToDwarfReg(reg), offset); - } -} -#endif // UNIX_AMD64_ABI - -#ifdef DEBUG - -//------------------------------------------------------------------------ -// DumpUnwindInfo: Dump the unwind data. -// -// Arguments: -// isHotCode - true if this unwind data is for the hot section, false otherwise. -// startOffset - byte offset of the code start that this unwind data represents. -// endOffset - byte offset of the code end that this unwind data represents. -// pHeader - pointer to the unwind data blob. -// -void DumpUnwindInfo(bool isHotCode, - UNATIVE_OFFSET startOffset, - UNATIVE_OFFSET endOffset, - const UNWIND_INFO* const pHeader) -{ - assert(false); -} - -#endif // DEBUG - -//------------------------------------------------------------------------ -// Compiler::unwindReserve: Ask the VM to reserve space for the unwind information -// for the function and all its funclets. Called once, just before asking the VM -// for memory and emitting the generated code. 
Calls unwindReserveFunc() to handle -// the main function and each of the funclets, in turn. -// -void Compiler::unwindReserve() -{ - assert(!compGeneratingProlog); - assert(!compGeneratingEpilog); - - assert(compFuncInfoCount > 0); - for (unsigned funcIdx = 0; funcIdx < compFuncInfoCount; funcIdx++) - { - unwindReserveFunc(funGetFunc(funcIdx)); - } -} - -//------------------------------------------------------------------------ -// Compiler::unwindReserveFunc: Reserve the unwind information from the VM for a -// given main function or funclet. -// -// Arguments: -// func - The main function or funclet to reserve unwind info for. -// -void Compiler::unwindReserveFunc(FuncInfoDsc* func) -{ - assert(false); -} - -//------------------------------------------------------------------------ -// Compiler::unwindEmit: Report all the unwind information to the VM. -// -// Arguments: -// pHotCode - Pointer to the beginning of the memory with the function and funclet hot code. -// pColdCode - Pointer to the beginning of the memory with the function and funclet cold code. -// -void Compiler::unwindEmit(void* pHotCode, void* pColdCode) -{ - assert(!compGeneratingProlog); - assert(!compGeneratingEpilog); - - assert(compFuncInfoCount > 0); - for (unsigned funcIdx = 0; funcIdx < compFuncInfoCount; funcIdx++) - { - unwindEmitFunc(funGetFunc(funcIdx), pHotCode, pColdCode); - } -} - -//------------------------------------------------------------------------ -// Compiler::unwindEmitFunc: Report the unwind information to the VM for a -// given main function or funclet. Reports the hot section, then the cold -// section if necessary. -// -// Arguments: -// func - The main function or funclet to reserve unwind info for. -// pHotCode - Pointer to the beginning of the memory with the function and funclet hot code. -// pColdCode - Pointer to the beginning of the memory with the function and funclet cold code. -// -void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode) -{ - assert(false); -} - -#endif // defined(TARGET_WASM) diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs index 9a8d544256f1..7e50166e1a2e 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/CodeGen/ILToLLVMImporter.cs @@ -175,74 +175,6 @@ public ILImporter(LLVMCodegenCompilation compilation, MethodDesc method, MethodI _builder = Context.CreateBuilder(); } -// [DllImport(JitSupportLibrary)] -// private extern static CorJitResult JitCompileMethod(out IntPtr exception, -// IntPtr jit, IntPtr thisHandle, IntPtr callbacks, -// ref CORINFO_METHOD_INFO info, uint flags, out IntPtr nativeEntry, out uint codeSize); -// -// // replaces (totally?) Import method -// public void ImportRyuJit() -// { -// IntPtr exception; -// IntPtr nativeEntry; -// uint codeSize; -// var result = JitCompileMethod(out exception, -// _jit, (IntPtr)Unsafe.AsPointer(ref _this), _unmanagedCallbacks, -// ref methodInfo, (uint)CorJitFlag.CORJIT_FLAG_CALL_GETJITFLAGS, out nativeEntry, out codeSize); -// if (exception != IntPtr.Zero) -// { -// if (_lastException != null) -// { -// // If we captured a managed exception, rethrow that. -// // TODO: might not actually be the real reason. It could be e.g. a JIT failure/bad IL that followed -// // an inlining attempt with a type system problem in it... 
-// #if SUPPORT_JIT -// _lastException.Throw(); -// #else -// if (_lastException.SourceException is TypeSystemException) -// { -// // Type system exceptions can be turned into code that throws the exception at runtime. -// _lastException.Throw(); -// } -// else -// { -// // This is just a bug somewhere. -// throw new CodeGenerationFailedException(_methodCodeNode.Method, _lastException.SourceException); -// } -// #endif -// } -// -// // This is a failure we don't know much about. -// char* szMessage = GetExceptionMessage(exception); -// string message = szMessage != null ? new string(szMessage) : "JIT Exception"; -// throw new Exception(message); -// } -// if (result == CorJitResult.CORJIT_BADCODE) -// { -// ThrowHelper.ThrowInvalidProgramException(); -// } -// if (result == CorJitResult.CORJIT_IMPLLIMITATION) -// { -// #if READYTORUN -// throw new RequiresRuntimeJitException("JIT implementation limitation"); -// #else -// ThrowHelper.ThrowInvalidProgramException(); -// #endif -// } -// if (result != CorJitResult.CORJIT_OK) -// { -// #if SUPPORT_JIT -// // FailFast? -// throw new Exception("JIT failed"); -// #else -// throw new CodeGenerationFailedException(_methodCodeNode.Method); -// #endif -// } -// -// -// } - - public void Import() { FindBasicBlocks(); diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs index db3ad5a0ac7a..7b62d6c7560c 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/LLVMCodegenCompilation.cs @@ -97,12 +97,7 @@ protected override void ComputeDependencyNodeDependencies(List methodsToCompile) { - CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => - { - var impl = new CorInfoImpl(this); - impl.RegisterLlvmCallbacks(_outputFile, Module.Target, Module.DataLayout); - return impl; - }); + CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); foreach (LLVMMethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) { @@ -120,7 +115,7 @@ private void CompileSingleThreaded(List methodsToCompile) static int totalMethodCount; static int ryuJitMethodCount; - private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodCodeNodeNeedingCode) + private unsafe void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodCodeNodeNeedingCode) { MethodDesc method = methodCodeNodeNeedingCode.Method; @@ -130,6 +125,7 @@ private void CompileSingleMethod(CorInfoImpl corInfo, LLVMMethodCodeNode methodC if (sig.Length == 0 && sig.ReturnType == TypeSystemContext.GetWellKnownType(WellKnownType.Void) && sig.IsStatic) // speed up { + corInfo.RegisterLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref corInfo), _outputFile, Module.Target, Module.DataLayout); corInfo.CompileMethod(methodCodeNodeNeedingCode); methodCodeNodeNeedingCode.CompilationCompleted = true; methodCodeNodeNeedingCode.SetDependencies(new DependencyNodeCore.DependencyList()); // TODO: how to track - check RyuJITCompilation diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs b/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs deleted file mode 100644 index 53f8b6ca4e34..000000000000 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/Compiler/RyuJitLLVMCompilation.cs +++ /dev/null @@ -1,200 +0,0 @@ -// // Licensed to the .NET Foundation under one or more agreements. 
-// // The .NET Foundation licenses this file to you under the MIT license. -// -// using System; -// using System.Collections.Generic; -// using System.Runtime.CompilerServices; -// using System.Threading; -// -// using ILCompiler.DependencyAnalysis; -// using ILCompiler.DependencyAnalysisFramework; -// -// using Internal.IL; -// using Internal.IL.Stubs; -// using Internal.TypeSystem; -// using Internal.JitInterface; -// -// namespace ILCompiler -// { -// public sealed class RyuJitLLLVMCompilation : Compilation -// { -// private readonly ConditionalWeakTable _corinfos = new ConditionalWeakTable(); -// internal readonly RyuJitCompilationOptions _compilationOptions; -// private readonly ExternSymbolMappedField _hardwareIntrinsicFlags; -// private CountdownEvent _compilationCountdown; -// private readonly Dictionary _instructionSetMap; -// -// public InstructionSetSupport InstructionSetSupport { get; } -// -// internal RyuJitLLLVMCompilation( -// DependencyAnalyzerBase dependencyGraph, -// NodeFactory nodeFactory, -// IEnumerable roots, -// ILProvider ilProvider, -// DebugInformationProvider debugInformationProvider, -// Logger logger, -// DevirtualizationManager devirtualizationManager, -// InstructionSetSupport instructionSetSupport, -// RyuJitCompilationOptions options) -// : base(dependencyGraph, nodeFactory, roots, ilProvider, debugInformationProvider, devirtualizationManager, logger) -// { -// _compilationOptions = options; -// _hardwareIntrinsicFlags = new ExternSymbolMappedField(nodeFactory.TypeSystemContext.GetWellKnownType(WellKnownType.Int32), "g_cpuFeatures"); -// InstructionSetSupport = instructionSetSupport; -// -// _instructionSetMap = new Dictionary(); -// foreach (var instructionSetInfo in InstructionSetFlags.ArchitectureToValidInstructionSets(TypeSystemContext.Target.Architecture)) -// { -// if (!instructionSetInfo.Specifiable) -// continue; -// -// _instructionSetMap.Add(instructionSetInfo.ManagedName, instructionSetInfo.InstructionSet); -// } -// } -// -// protected override void CompileInternal(string outputFile, ObjectDumper dumper) -// { -// _dependencyGraph.ComputeMarkedNodes(); -// var nodes = _dependencyGraph.MarkedNodeList; -// -// NodeFactory.SetMarkingComplete(); -// ObjectWriter.EmitObject(outputFile, nodes, NodeFactory, dumper); -// } -// -// protected override void ComputeDependencyNodeDependencies(List> obj) -// { -// // Determine the list of method we actually need to compile -// var methodsToCompile = new List(); -// var canonicalMethodsToCompile = new HashSet(); -// -// foreach (DependencyNodeCore dependency in obj) -// { -// var methodCodeNodeNeedingCode = dependency as MethodCodeNode; -// if (methodCodeNodeNeedingCode == null) -// { -// // To compute dependencies of the shadow method that tracks dictionary -// // dependencies we need to ensure there is code for the canonical method body. 
-// var dependencyMethod = (ShadowConcreteMethodNode)dependency; -// methodCodeNodeNeedingCode = (MethodCodeNode)dependencyMethod.CanonicalMethodNode; -// } -// -// // We might have already queued this method for compilation -// MethodDesc method = methodCodeNodeNeedingCode.Method; -// if (method.IsCanonicalMethod(CanonicalFormKind.Any) -// && !canonicalMethodsToCompile.Add(method)) -// { -// continue; -// } -// -// methodsToCompile.Add(methodCodeNodeNeedingCode); -// } -// -// if ((_compilationOptions & RyuJitCompilationOptions.SingleThreadedCompilation) != 0) -// { -// CompileSingleThreaded(methodsToCompile); -// } -// else -// { -// CompileMultiThreaded(methodsToCompile); -// } -// } -// private void CompileMultiThreaded(List methodsToCompile) -// { -// if (Logger.IsVerbose) -// { -// Logger.Writer.WriteLine($"Compiling {methodsToCompile.Count} methods..."); -// } -// -// WaitCallback compileSingleMethodDelegate = m => -// { -// CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); -// CompileSingleMethod(corInfo, (MethodCodeNode)m); -// }; -// -// using (_compilationCountdown = new CountdownEvent(methodsToCompile.Count)) -// { -// -// foreach (MethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) -// { -// ThreadPool.QueueUserWorkItem(compileSingleMethodDelegate, methodCodeNodeNeedingCode); -// } -// -// _compilationCountdown.Wait(); -// _compilationCountdown = null; -// } -// } -// -// -// private void CompileSingleThreaded(List methodsToCompile) -// { -// CorInfoImpl corInfo = _corinfos.GetValue(Thread.CurrentThread, thread => new CorInfoImpl(this)); -// -// foreach (MethodCodeNode methodCodeNodeNeedingCode in methodsToCompile) -// { -// if (Logger.IsVerbose) -// { -// Logger.Writer.WriteLine($"Compiling {methodCodeNodeNeedingCode.Method}..."); -// } -// -// CompileSingleMethod(corInfo, methodCodeNodeNeedingCode); -// } -// } -// -// private void CompileSingleMethod(CorInfoImpl corInfo, MethodCodeNode methodCodeNodeNeedingCode) -// { -// MethodDesc method = methodCodeNodeNeedingCode.Method; -// -// try -// { -// corInfo.CompileMethod(methodCodeNodeNeedingCode); -// } -// catch (TypeSystemException ex) -// { -// // TODO: fail compilation if a switch was passed -// -// // Try to compile the method again, but with a throwing method body this time. -// MethodIL throwingIL = TypeSystemThrowingILEmitter.EmitIL(method, ex); -// corInfo.CompileMethod(methodCodeNodeNeedingCode, throwingIL); -// -// // TODO: Log as a warning. For now, just log to the logger; but this needs to -// // have an error code, be supressible, the method name/sig needs to be properly formatted, etc. 
-// // https://github.com/dotnet/corert/issues/72 -// Logger.Writer.WriteLine($"Warning: Method `{method}` will always throw because: {ex.Message}"); -// } -// finally -// { -// if (_compilationCountdown != null) -// _compilationCountdown.Signal(); -// } -// } -// -// public override MethodIL GetMethodIL(MethodDesc method) -// { -// TypeDesc owningType = method.OwningType; -// string intrinsicId = InstructionSetSupport.GetHardwareIntrinsicId(TypeSystemContext.Target.Architecture, owningType); -// if (!string.IsNullOrEmpty(intrinsicId) -// && HardwareIntrinsicHelpers.IsIsSupportedMethod(method)) -// { -// InstructionSet instructionSet = _instructionSetMap[intrinsicId]; -// -// // If this is an instruction set that is optimistically supported, but is not one of the -// // intrinsics that are known to be always available, emit IL that checks the support level -// // at runtime. -// if (!InstructionSetSupport.IsInstructionSetSupported(instructionSet) -// && InstructionSetSupport.OptimisticFlags.HasInstructionSet(instructionSet)) -// { -// return HardwareIntrinsicHelpers.EmitIsSupportedIL(method, _hardwareIntrinsicFlags); -// } -// } -// -// return base.GetMethodIL(method); -// } -// } -// -// [Flags] -// public enum RyuJitCompilationOptions -// { -// MethodBodyFolding = 0x1, -// SingleThreadedCompilation = 0x2, -// } -// } diff --git a/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj b/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj index 5c2c17506c42..b1e41e99bb00 100644 --- a/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj +++ b/src/coreclr/tools/aot/ILCompiler.LLVM/ILCompiler.LLVM.csproj @@ -63,7 +63,6 @@ - diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs index b4bcf103b52c..d266de7ca997 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.Llvm.cs @@ -1,37 +1,29 @@ using System; -using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using ILCompiler; using Internal.TypeSystem; namespace Internal.JitInterface { public unsafe sealed partial class CorInfoImpl { - [ThreadStatic] - private static CorInfoImpl _thisStatic; - [UnmanagedCallersOnly] public static byte* getMangledMethodName(IntPtr thisHandle, CORINFO_METHOD_STRUCT_* ftn) { - //var _this = GetThis(thisHandle); // TODO: this doesn't work, but how does it cope anyway with this being moved by the GC? 
+            var _this = GetThis(thisHandle);
-            MethodDesc method = _thisStatic.HandleToObject(ftn);
+            MethodDesc method = _this.HandleToObject(ftn);
-            return (byte*)_thisStatic.GetPin(_thisStatic._compilation.NameMangler.GetMangledMethodName(method).UnderlyingArray);
+            return (byte*)_this.GetPin(_this._compilation.NameMangler.GetMangledMethodName(method).UnderlyingArray);
         }
 
         [DllImport(JitLibrary)]
         private extern static void registerLlvmCallbacks(IntPtr thisHandle, byte* outputFileName, byte* triple, byte* dataLayout, delegate* unmanaged<IntPtr, CORINFO_METHOD_STRUCT_*, byte*> getMangedMethodNamePtr);
 
-        public void RegisterLlvmCallbacks(string outputFileName, string triple, string dataLayout)
+        public void RegisterLlvmCallbacks(IntPtr corInfoPtr, string outputFileName, string triple, string dataLayout)
         {
-            CorInfoImpl _this = this;
-            _thisStatic = this;
-
-            registerLlvmCallbacks((IntPtr)Unsafe.AsPointer(ref _this), (byte*)_thisStatic.GetPin(StringToUTF8(outputFileName)),
-                (byte*)_thisStatic.GetPin(StringToUTF8(triple)),
-                (byte*)_thisStatic.GetPin(StringToUTF8(dataLayout)),
+            registerLlvmCallbacks(corInfoPtr, (byte*)GetPin(StringToUTF8(outputFileName)),
+                (byte*)GetPin(StringToUTF8(triple)),
+                (byte*)GetPin(StringToUTF8(dataLayout)),
                 (delegate* unmanaged<IntPtr, CORINFO_METHOD_STRUCT_*, byte*>) &getMangledMethodName);
         }
     }
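
Note on the callback registration pattern in the hunk above: the managed side now hands the native JIT an opaque instance handle plus an [UnmanagedCallersOnly] function pointer, and pins the UTF-8 strings it passes so the native side can hold on to the raw pointers. The stand-alone C# sketch below shows that handshake in isolation, so it can be read without the rest of CorInfoImpl. Everything in it is hypothetical scaffolding: FakeRegisterLlvmCallbacks is a managed stand-in for the native registerLlvmCallbacks export, NameProvider stands in for CorInfoImpl, and the mangled names are made up; it mirrors only the shape of the calls, not the real JIT/EE interface. It builds as an ordinary console program with unsafe blocks enabled.

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

// Self-contained sketch (hypothetical names) of the register-then-call-back pattern:
// an opaque handle plus an unmanaged function pointer, with pinned UTF-8 arguments.
unsafe class CallbackRegistrationSketch
{
    // Stand-in for the native registerLlvmCallbacks export: it just remembers what it was given.
    static IntPtr s_thisHandle;
    static delegate* unmanaged<IntPtr, int, byte*> s_getName;

    static void FakeRegisterLlvmCallbacks(IntPtr thisHandle, byte* outputFileName,
                                          delegate* unmanaged<IntPtr, int, byte*> getName)
    {
        s_thisHandle = thisHandle;
        s_getName = getName;
        Console.WriteLine($"registered, output = {Marshal.PtrToStringUTF8((IntPtr)outputFileName)}");
    }

    // The managed callback: recovers the owning instance from the opaque handle,
    // the same way GetThis(thisHandle) does in CorInfoImpl.
    [UnmanagedCallersOnly]
    static byte* GetMangledName(IntPtr thisHandle, int token)
    {
        var self = (NameProvider)GCHandle.FromIntPtr(thisHandle).Target!;
        return self.GetPinnedUtf8(self.Mangle(token));
    }

    class NameProvider
    {
        // Keep every pinned buffer alive, mirroring what GetPin is used for.
        private readonly List<GCHandle> _pins = new();

        public string Mangle(int token) => $"S_P_Sketch_Method_{token}"; // made-up mangling

        public byte* GetPinnedUtf8(string s)
        {
            byte[] bytes = Encoding.UTF8.GetBytes(s + "\0");
            GCHandle pin = GCHandle.Alloc(bytes, GCHandleType.Pinned);
            _pins.Add(pin);
            return (byte*)pin.AddrOfPinnedObject();
        }
    }

    static void Main()
    {
        var provider = new NameProvider();
        GCHandle thisHandle = GCHandle.Alloc(provider); // opaque, GC-stable identity for the callback

        byte[] outputFile = Encoding.UTF8.GetBytes("module.bc\0");
        fixed (byte* pOutputFile = outputFile)
        {
            FakeRegisterLlvmCallbacks(GCHandle.ToIntPtr(thisHandle), pOutputFile, &GetMangledName);
        }

        // "Native" code calling back into managed code through the registered function pointer.
        byte* name = s_getName(s_thisHandle, 42);
        Console.WriteLine(Marshal.PtrToStringUTF8((IntPtr)name));

        thisHandle.Free();
    }
}

The design point the sketch makes explicit is the one the patch moves toward: the callback recovers its owning instance from the handle supplied at registration time (as GetThis does), rather than from the [ThreadStatic] _thisStatic field that this change removes, and the pinned buffers stay alive for as long as the native side may dereference the pointers it was given.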