This repository has been archived by the owner on Nov 1, 2020. It is now read-only.

Port normalized SpinWait from CoreCLR #7569

Merged · 4 commits · Jul 1, 2019
1 change: 1 addition & 0 deletions src/Native/Runtime/CMakeLists.txt
@@ -40,6 +40,7 @@ set(COMMON_RUNTIME_SOURCES
thread.cpp
threadstore.cpp
UniversalTransitionHelpers.cpp
yieldprocessornormalized.cpp

# ProjectN only
# gcdump.cpp
1 change: 1 addition & 0 deletions src/Native/Runtime/Crst.h
@@ -22,6 +22,7 @@ enum CrstType
CrstGcStressControl,
CrstSuspendEE,
CrstCastCache,
CrstYieldProcessorNormalized,
};

enum CrstFlags
5 changes: 5 additions & 0 deletions src/Native/Runtime/FinalizerHelpers.cpp
@@ -24,6 +24,8 @@
#include "threadstore.inl"
#include "thread.inl"

#include "yieldprocessornormalized.h"

GPTR_DECL(Thread, g_pFinalizerThread);

CLREventStatic g_FinalizerEvent;
@@ -50,6 +52,9 @@ UInt32 WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
UInt32 uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
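
The finalizer thread is a convenient place to run the calibration: the measurement loop added below spins for roughly MeasureDurationMs (10 ms) on the first call, and every later call to EnsureYieldProcessorNormalizedInitialized returns after a single flag check. A minimal sketch of that first-caller-pays pattern, assuming a hypothetical background-thread entry point that is not part of this PR:

// Illustrative only: any long-lived thread that starts before heavy spinning can
// absorb the one-time ~10 ms measurement so contended paths never pay for it.
// Assumes yieldprocessornormalized.h is included, as in FinalizerHelpers.cpp above.
static UInt32 WINAPI BackgroundThreadStart_Sketch(void* /* pContext */)
{
    // The first call takes s_initializeYieldProcessorNormalizedCrst and measures the
    // yield duration; subsequent calls see s_isYieldProcessorNormalizedInitialized set.
    EnsureYieldProcessorNormalizedInitialized();

    // ... the thread's real work (waiting for requests, etc.) goes here ...
    return 0;
}
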
5 changes: 3 additions & 2 deletions src/Native/Runtime/MiscHelpers.cpp
@@ -39,6 +39,7 @@
#include "Volatile.h"
#include "GCMemoryHelpers.h"
#include "GCMemoryHelpers.inl"
#include "yieldprocessornormalized.h"

COOP_PINVOKE_HELPER(void, RhDebugBreak, ())
{
@@ -48,8 +49,8 @@ COOP_PINVOKE_HELPER(void, RhDebugBreak, ())
// Busy spin for the given number of iterations.
COOP_PINVOKE_HELPER(void, RhSpinWait, (Int32 iterations))
{
for(int i = 0; i < iterations; i++)
PalYieldProcessor();
YieldProcessorNormalizationInfo normalizationInfo;
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, iterations);
}

// Yield the cpu to another thread ready to process, if one is available.
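
RhSpinWait previously issued the requested number of raw PalYieldProcessor() calls; it now routes through the normalized helper so that an iteration count tuned long ago keeps meaning roughly the same wall-clock delay on processors whose pause instruction has grown much slower. A minimal sketch of what that rescaling could look like, assuming the convention of about eight pre-Skylake pauses per normalized yield; the _Sketch name and constants are illustrative, not the PR's actual helper:

// Illustrative sketch: convert a legacy pre-Skylake-tuned count into raw yields
// using the calibrated factor from yieldprocessornormalized.cpp.
static void YieldProcessorNormalizedForPreSkylakeCount_Sketch(unsigned int preSkylakeCount)
{
    const unsigned int PreSkylakeCountToNormalizedCountDivisor = 8; // assumption
    unsigned int normalizedCount = preSkylakeCount / PreSkylakeCountToNormalizedCountDivisor;
    if (normalizedCount == 0)
    {
        normalizedCount = 1;
    }

    // g_yieldsPerNormalizedYield is 1 on Skylake-class parts and ~8 on older ones,
    // so the loop below takes roughly the same wall-clock time either way.
    unsigned int totalYields = normalizedCount * g_yieldsPerNormalizedYield;
    for (unsigned int i = 0; i < totalYields; ++i)
    {
        PalYieldProcessor();
    }
}

On pre-Skylake hardware the measured factor comes out near eight, so the total stays close to the original count; on Skylake-class hardware the factor is one and about an eighth as many, much longer, pauses are issued.
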
40 changes: 6 additions & 34 deletions src/Native/Runtime/RWLock.cpp
@@ -25,6 +25,7 @@
#include "threadstore.h"
#include "threadstore.inl"
#include "RuntimeInstance.h"
#include "yieldprocessornormalized.h"

// Configurable constants used across our spin locks
// Initialization here is necessary so that we have meaningful values before the runtime is started
@@ -179,23 +180,8 @@ void ReaderWriterLock::AcquireReadLockWorker()
break;

// Delay by approximately 2*i clock cycles (Pentium III).
// This is brittle code - future processors may of course execute this
// faster or slower, and future code generators may eliminate the loop altogether.
// The precise value of the delay is not critical, however, and I can't think
// of a better way that isn't machine-dependent - petersol.
int sum = 0;
for (int delayCount = uDelay; --delayCount; )
{
sum += delayCount;
PalYieldProcessor(); // indicate to the processor that we are spining
}
if (sum == 0)
{
// never executed, just to fool the compiler into thinking sum is live here,
// so that it won't optimize away the loop.
static char dummy;
dummy++;
}
YieldProcessorNormalizedForPreSkylakeCount(uDelay);

// exponential backoff: wait a factor longer in the next iteration
uDelay *= g_SpinConstants.uBackoffFactor;
}
@@ -260,24 +246,10 @@ void ReaderWriterLock::AcquireWriteLock()
{
break;
}

// Delay by approximately 2*i clock cycles (Pentium III).
// This is brittle code - future processors may of course execute this
// faster or slower, and future code generators may eliminate the loop altogether.
// The precise value of the delay is not critical, however, and I can't think
// of a better way that isn't machine-dependent - petersol.
int sum = 0;
for (int delayCount = uDelay; --delayCount; )
{
sum += delayCount;
PalYieldProcessor(); // indicate to the processor that we are spining
}
if (sum == 0)
{
// never executed, just to fool the compiler into thinking sum is live here,
// so that it won't optimize away the loop.
static char dummy;
dummy++;
}
YieldProcessorNormalizedForPreSkylakeCount(uDelay);

// exponential backoff: wait a factor longer in the next iteration
uDelay *= g_SpinConstants.uBackoffFactor;
}
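
Both lock paths keep the existing exponential backoff (uDelay *= g_SpinConstants.uBackoffFactor) and only swap the hand-rolled delay loop for the normalized helper. A minimal sketch of that overall pattern, with hypothetical initial, growth, and cap constants standing in for the real g_SpinConstants values:

// Illustrative spin-with-backoff pattern: each failed attempt burns a normalized
// delay, then waits geometrically longer so contended threads back off instead of
// hammering the cache line.
static void SpinWithExponentialBackoff_Sketch(bool (*tryAcquire)())
{
    unsigned int uDelay = 50;              // assumed initial delay, in pre-Skylake units
    const unsigned int uBackoffFactor = 3; // assumed growth factor
    const unsigned int uMaxDelay = 20000;  // assumed cap on the spin delay

    while (!tryAcquire())
    {
        // The wall-clock length of this delay is machine-independent thanks to
        // the normalization measurement.
        YieldProcessorNormalizedForPreSkylakeCount(uDelay);

        // Exponential backoff: wait a factor longer in the next iteration.
        uDelay *= uBackoffFactor;
        if (uDelay > uMaxDelay)
        {
            uDelay = uMaxDelay; // beyond this, yielding the thread is the better choice
        }
    }
}
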
3 changes: 3 additions & 0 deletions src/Native/Runtime/startup.cpp
@@ -26,6 +26,7 @@
#include "RhConfig.h"
#include "stressLog.h"
#include "RestrictedCallouts.h"
#include "yieldprocessornormalized.h"

#ifndef DACCESS_COMPILE

@@ -114,6 +115,8 @@ static bool InitDLL(HANDLE hPalInstance)
if (!g_ThunkPoolLock.InitNoThrow(CrstType::CrstCastCache))
return false;

InitializeYieldProcessorNormalizedCrst();

return true;
}

5 changes: 3 additions & 2 deletions src/Native/Runtime/threadstore.cpp
@@ -23,6 +23,7 @@
#include "threadstore.inl"
#include "RuntimeInstance.h"
#include "TargetPtrs.h"
#include "yieldprocessornormalized.h"

#include "slist.inl"
#include "GCMemoryHelpers.h"
@@ -246,6 +247,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent, bool fireDebugEvent)
PalFlushProcessWriteBuffers();

bool keepWaiting;
YieldProcessorNormalizationInfo normalizationInfo;
do
{
keepWaiting = false;
@@ -283,8 +285,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent, bool fireDebugEvent)
// too long (we probably don't need a 15ms wait here). Instead, we'll just burn some
// cycles.
// @TODO: need tuning for spin
for (int i = 0; i < 10000; i++)
PalYieldProcessor();
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 10000);
}
}

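
SuspendAllThreads constructs a YieldProcessorNormalizationInfo once, outside the retry loop, so the per-iteration delay does not have to reread the calibration globals every time around. A plausible shape for that type, assuming it is simply a snapshot of the two globals defined in yieldprocessornormalized.cpp below; the field names here are assumptions, not the actual declaration:

// Illustrative sketch: capture the calibrated values once so a hot wait loop
// reads locals instead of globals on every spin iteration.
struct YieldProcessorNormalizationInfo_Sketch
{
    unsigned int yieldsPerNormalizedYield;
    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;

    YieldProcessorNormalizationInfo_Sketch()
        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
          optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration)
    {
    }
};
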
119 changes: 119 additions & 0 deletions src/Native/Runtime/yieldprocessornormalized.cpp
@@ -0,0 +1,119 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#include "common.h"
#include "gcenv.h"
#include "gcheaputilities.h"
#include "CommonTypes.h"
#include "CommonMacros.h"
#include "daccess.h"
#include "DebugMacrosExt.h"
#include "PalRedhawkCommon.h"
#include "PalRedhawk.h"
#include "rhassert.h"
#include "slist.h"
#include "Volatile.h"
#include "yieldprocessornormalized.h"

static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

LARGE_INTEGER li;
if (!PalQueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}
ULONGLONG ticksPerSecond = li.QuadPart;

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
PalQueryPerformanceCounter(&li);
ULONGLONG startTicks = li.QuadPart;
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

PalQueryPerformanceCounter(&li);
ULONGLONG nowTicks = li.QuadPart;
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
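
Beyond the pre-Skylake compatibility helpers used by the call sites above, the two calibrated globals are what a SpinWait-style spinner would consume directly. A minimal sketch of such a caller, assuming a simple linear growth policy up to the optimal cap; the function and its policy are illustrative and not part of this change:

// Illustrative sketch: spend a growing, normalized amount of time per spin
// iteration, capped at the point where SwitchToThread/Sleep becomes the better
// use of the time.
static void NormalizedSpinIteration_Sketch(unsigned int spinIndex)
{
    unsigned int normalizedYields = spinIndex + 1; // assumed growth policy
    if (normalizedYields > g_optimalMaxNormalizedYieldsPerSpinIteration)
    {
        normalizedYields = g_optimalMaxNormalizedYieldsPerSpinIteration;
    }

    // Each normalized yield expands to the measured number of raw yields.
    for (unsigned int i = 0; i < normalizedYields * g_yieldsPerNormalizedYield; ++i)
    {
        PalYieldProcessor();
    }
}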