This repository has been archived by the owner on Nov 1, 2020. It is now read-only.

Port normalized SpinWait from CoreCLR #7569

Merged · 4 commits · Jul 1, 2019
1 change: 1 addition & 0 deletions src/Native/Runtime/CMakeLists.txt
@@ -40,6 +40,7 @@ set(COMMON_RUNTIME_SOURCES
thread.cpp
threadstore.cpp
UniversalTransitionHelpers.cpp
yieldprocessornormalized.cpp

# ProjectN only
# gcdump.cpp
1 change: 1 addition & 0 deletions src/Native/Runtime/Crst.h
@@ -22,6 +22,7 @@ enum CrstType
CrstGcStressControl,
CrstSuspendEE,
CrstCastCache,
CrstYieldProcessorNormalized,
};

enum CrstFlags
5 changes: 5 additions & 0 deletions src/Native/Runtime/FinalizerHelpers.cpp
@@ -24,6 +24,8 @@
#include "threadstore.inl"
#include "thread.inl"

#include "yieldprocessornormalized.h"

GPTR_DECL(Thread, g_pFinalizerThread);

CLREventStatic g_FinalizerEvent;
@@ -50,6 +52,9 @@ UInt32 WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
UInt32 uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
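
The finalizer thread is a convenient place to run the calibration: the measurement loop added below spins for roughly MeasureDurationMs (10 ms) on the first call, and every later call to EnsureYieldProcessorNormalizedInitialized returns after a single flag check. A minimal sketch of that first-caller-pays pattern, assuming a hypothetical background-thread entry point that is not part of this PR:

// Illustrative only: any long-lived thread that starts before heavy spinning can
// absorb the one-time ~10 ms measurement so contended paths never pay for it.
// Assumes yieldprocessornormalized.h is included, as in FinalizerHelpers.cpp above.
static UInt32 WINAPI BackgroundThreadStart_Sketch(void* /* pContext */)
{
    // The first call takes s_initializeYieldProcessorNormalizedCrst and measures the
    // yield duration; subsequent calls see s_isYieldProcessorNormalizedInitialized set.
    EnsureYieldProcessorNormalizedInitialized();

    // ... the thread's real work (waiting for requests, etc.) goes here ...
    return 0;
}
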
5 changes: 3 additions & 2 deletions src/Native/Runtime/MiscHelpers.cpp
@@ -39,6 +39,7 @@
#include "Volatile.h"
#include "GCMemoryHelpers.h"
#include "GCMemoryHelpers.inl"
#include "yieldprocessornormalized.h"

COOP_PINVOKE_HELPER(void, RhDebugBreak, ())
{
@@ -48,8 +49,8 @@ COOP_PINVOKE_HELPER(void, RhDebugBreak, ())
// Busy spin for the given number of iterations.
COOP_PINVOKE_HELPER(void, RhSpinWait, (Int32 iterations))
{
for(int i = 0; i < iterations; i++)
PalYieldProcessor();
YieldProcessorNormalizationInfo normalizationInfo;
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, iterations);
}

// Yield the cpu to another thread ready to process, if one is available.
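
RhSpinWait previously issued the requested number of raw PalYieldProcessor() calls; it now routes through the normalized helper so that an iteration count tuned long ago keeps meaning roughly the same wall-clock delay on processors whose pause instruction has grown much slower. A minimal sketch of what that rescaling could look like, assuming the convention of about eight pre-Skylake pauses per normalized yield; the _Sketch name and constants are illustrative, not the PR's actual helper:

// Illustrative sketch: convert a legacy pre-Skylake-tuned count into raw yields
// using the calibrated factor from yieldprocessornormalized.cpp.
static void YieldProcessorNormalizedForPreSkylakeCount_Sketch(unsigned int preSkylakeCount)
{
    const unsigned int PreSkylakeCountToNormalizedCountDivisor = 8; // assumption
    unsigned int normalizedCount = preSkylakeCount / PreSkylakeCountToNormalizedCountDivisor;
    if (normalizedCount == 0)
    {
        normalizedCount = 1;
    }

    // g_yieldsPerNormalizedYield is 1 on Skylake-class parts and ~8 on older ones,
    // so the loop below takes roughly the same wall-clock time either way.
    unsigned int totalYields = normalizedCount * g_yieldsPerNormalizedYield;
    for (unsigned int i = 0; i < totalYields; ++i)
    {
        PalYieldProcessor();
    }
}

On pre-Skylake hardware the measured factor comes out near eight, so the total stays close to the original count; on Skylake-class hardware the factor is one and about an eighth as many, much longer, pauses are issued.
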
40 changes: 6 additions & 34 deletions src/Native/Runtime/RWLock.cpp
@@ -25,6 +25,7 @@
#include "threadstore.h"
#include "threadstore.inl"
#include "RuntimeInstance.h"
#include "yieldprocessornormalized.h"

// Configurable constants used across our spin locks
// Initialization here is necessary so that we have meaningful values before the runtime is started
@@ -179,23 +180,8 @@ void ReaderWriterLock::AcquireReadLockWorker()
break;

// Delay by approximately 2*i clock cycles (Pentium III).
// This is brittle code - future processors may of course execute this
// faster or slower, and future code generators may eliminate the loop altogether.
// The precise value of the delay is not critical, however, and I can't think
// of a better way that isn't machine-dependent - petersol.
int sum = 0;
for (int delayCount = uDelay; --delayCount; )
{
sum += delayCount;
PalYieldProcessor(); // indicate to the processor that we are spining
}
if (sum == 0)
{
// never executed, just to fool the compiler into thinking sum is live here,
// so that it won't optimize away the loop.
static char dummy;
dummy++;
}
YieldProcessorNormalizedForPreSkylakeCount(uDelay);

// exponential backoff: wait a factor longer in the next iteration
uDelay *= g_SpinConstants.uBackoffFactor;
}
@@ -260,24 +246,10 @@ void ReaderWriterLock::AcquireWriteLock()
{
break;
}

// Delay by approximately 2*i clock cycles (Pentium III).
// This is brittle code - future processors may of course execute this
// faster or slower, and future code generators may eliminate the loop altogether.
// The precise value of the delay is not critical, however, and I can't think
// of a better way that isn't machine-dependent - petersol.
int sum = 0;
for (int delayCount = uDelay; --delayCount; )
{
sum += delayCount;
PalYieldProcessor(); // indicate to the processor that we are spining
}
if (sum == 0)
{
// never executed, just to fool the compiler into thinking sum is live here,
// so that it won't optimize away the loop.
static char dummy;
dummy++;
}
YieldProcessorNormalizedForPreSkylakeCount(uDelay);

// exponential backoff: wait a factor longer in the next iteration
uDelay *= g_SpinConstants.uBackoffFactor;
}
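
Both lock paths keep the existing exponential backoff (uDelay *= g_SpinConstants.uBackoffFactor) and only swap the hand-rolled delay loop for the normalized helper. A minimal sketch of that overall pattern, with hypothetical initial, growth, and cap constants standing in for the real g_SpinConstants values:

// Illustrative spin-with-backoff pattern: each failed attempt burns a normalized
// delay, then waits geometrically longer so contended threads back off instead of
// hammering the cache line.
static void SpinWithExponentialBackoff_Sketch(bool (*tryAcquire)())
{
    unsigned int uDelay = 50;              // assumed initial delay, in pre-Skylake units
    const unsigned int uBackoffFactor = 3; // assumed growth factor
    const unsigned int uMaxDelay = 20000;  // assumed cap on the spin delay

    while (!tryAcquire())
    {
        // The wall-clock length of this delay is machine-independent thanks to
        // the normalization measurement.
        YieldProcessorNormalizedForPreSkylakeCount(uDelay);

        // Exponential backoff: wait a factor longer in the next iteration.
        uDelay *= uBackoffFactor;
        if (uDelay > uMaxDelay)
        {
            uDelay = uMaxDelay; // beyond this, yielding the thread is the better choice
        }
    }
}
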
3 changes: 3 additions & 0 deletions src/Native/Runtime/startup.cpp
@@ -26,6 +26,7 @@
#include "RhConfig.h"
#include "stressLog.h"
#include "RestrictedCallouts.h"
#include "yieldprocessornormalized.h"

#ifndef DACCESS_COMPILE

@@ -114,6 +115,8 @@ static bool InitDLL(HANDLE hPalInstance)
if (!g_ThunkPoolLock.InitNoThrow(CrstType::CrstCastCache))
return false;

InitializeYieldProcessorNormalizedCrst();

return true;
}

5 changes: 3 additions & 2 deletions src/Native/Runtime/threadstore.cpp
@@ -23,6 +23,7 @@
#include "threadstore.inl"
#include "RuntimeInstance.h"
#include "TargetPtrs.h"
#include "yieldprocessornormalized.h"

#include "slist.inl"
#include "GCMemoryHelpers.h"
@@ -246,6 +247,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent, bool fireDebugEvent)
PalFlushProcessWriteBuffers();

bool keepWaiting;
YieldProcessorNormalizationInfo normalizationInfo;
do
{
keepWaiting = false;
@@ -283,8 +285,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent, bool fireDebugEvent)
// too long (we probably don't need a 15ms wait here). Instead, we'll just burn some
// cycles.
// @TODO: need tuning for spin
for (int i = 0; i < 10000; i++)
PalYieldProcessor();
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 10000);
}
}

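
SuspendAllThreads constructs a YieldProcessorNormalizationInfo once, outside the retry loop, so the per-iteration delay does not have to reread the calibration globals every time around. A plausible shape for that type, assuming it is simply a snapshot of the two globals defined in yieldprocessornormalized.cpp below; the field names here are assumptions, not the actual declaration:

// Illustrative sketch: capture the calibrated values once so a hot wait loop
// reads locals instead of globals on every spin iteration.
struct YieldProcessorNormalizationInfo_Sketch
{
    unsigned int yieldsPerNormalizedYield;
    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;

    YieldProcessorNormalizationInfo_Sketch()
        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
          optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration)
    {
    }
};
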
119 changes: 119 additions & 0 deletions src/Native/Runtime/yieldprocessornormalized.cpp
@@ -0,0 +1,119 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#include "common.h"
#include "gcenv.h"
#include "gcheaputilities.h"
#include "CommonTypes.h"
#include "CommonMacros.h"
#include "daccess.h"
#include "DebugMacrosExt.h"
#include "PalRedhawkCommon.h"
#include "PalRedhawk.h"
#include "rhassert.h"
#include "slist.h"
#include "Volatile.h"
#include "yieldprocessornormalized.h"

static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

LARGE_INTEGER li;
if (!PalQueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}
ULONGLONG ticksPerSecond = li.QuadPart;

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
PalQueryPerformanceCounter(&li);
ULONGLONG startTicks = li.QuadPart;
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

PalQueryPerformanceCounter(&li);
ULONGLONG nowTicks = li.QuadPart;
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
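
Beyond the pre-Skylake compatibility helpers used by the call sites above, the two calibrated globals are what a SpinWait-style spinner would consume directly. A minimal sketch of such a caller, assuming a simple linear growth policy up to the optimal cap; the function and its policy are illustrative and not part of this change:

// Illustrative sketch: spend a growing, normalized amount of time per spin
// iteration, capped at the point where SwitchToThread/Sleep becomes the better
// use of the time.
static void NormalizedSpinIteration_Sketch(unsigned int spinIndex)
{
    unsigned int normalizedYields = spinIndex + 1; // assumed growth policy
    if (normalizedYields > g_optimalMaxNormalizedYieldsPerSpinIteration)
    {
        normalizedYields = g_optimalMaxNormalizedYieldsPerSpinIteration;
    }

    // Each normalized yield expands to the measured number of raw yields.
    for (unsigned int i = 0; i < normalizedYields * g_yieldsPerNormalizedYield; ++i)
    {
        PalYieldProcessor();
    }
}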