Skip to content
This repository has been archived by the owner on Nov 1, 2020. It is now read-only.

Port normalized SpinWait from CoreCLR #7569

Merged
merged 4 commits into from
Jul 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Native/Runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ set(COMMON_RUNTIME_SOURCES
thread.cpp
threadstore.cpp
UniversalTransitionHelpers.cpp
yieldprocessornormalized.cpp

# ProjectN only
# gcdump.cpp
Expand Down
1 change: 1 addition & 0 deletions src/Native/Runtime/Crst.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ enum CrstType
CrstGcStressControl,
CrstSuspendEE,
CrstCastCache,
CrstYieldProcessorNormalized,
};

enum CrstFlags
Expand Down
5 changes: 5 additions & 0 deletions src/Native/Runtime/FinalizerHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include "threadstore.inl"
#include "thread.inl"

#include "yieldprocessornormalized.h"

GPTR_DECL(Thread, g_pFinalizerThread);

CLREventStatic g_FinalizerEvent;
Expand All @@ -50,6 +52,9 @@ UInt32 WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
UInt32 uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
Expand Down
5 changes: 3 additions & 2 deletions src/Native/Runtime/MiscHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "Volatile.h"
#include "GCMemoryHelpers.h"
#include "GCMemoryHelpers.inl"
#include "yieldprocessornormalized.h"

COOP_PINVOKE_HELPER(void, RhDebugBreak, ())
{
Expand All @@ -48,8 +49,8 @@ COOP_PINVOKE_HELPER(void, RhDebugBreak, ())
// Busy spin for the given number of iterations.
COOP_PINVOKE_HELPER(void, RhSpinWait, (Int32 iterations))
{
    // Pause for roughly the requested number of pre-Skylake-equivalent yield
    // iterations. The normalization info scales the count by the measured cost
    // of a hardware yield on this processor, so the wall-clock delay stays
    // comparable across CPU generations.
    YieldProcessorNormalizationInfo info;
    YieldProcessorNormalizedForPreSkylakeCount(info, iterations);
}

// Yield the cpu to another thread ready to process, if one is available.
Expand Down
40 changes: 6 additions & 34 deletions src/Native/Runtime/RWLock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "threadstore.h"
#include "threadstore.inl"
#include "RuntimeInstance.h"
#include "yieldprocessornormalized.h"

// Configurable constants used across our spin locks
// Initialization here is necessary so that we have meaningful values before the runtime is started
Expand Down Expand Up @@ -179,23 +180,8 @@ void ReaderWriterLock::AcquireReadLockWorker()
break;

// Delay by approximately 2*i clock cycles (Pentium III).
// This is brittle code - future processors may of course execute this
// faster or slower, and future code generators may eliminate the loop altogether.
// The precise value of the delay is not critical, however, and I can't think
// of a better way that isn't machine-dependent - petersol.
int sum = 0;
for (int delayCount = uDelay; --delayCount; )
{
sum += delayCount;
PalYieldProcessor(); // indicate to the processor that we are spining
}
if (sum == 0)
{
// never executed, just to fool the compiler into thinking sum is live here,
// so that it won't optimize away the loop.
static char dummy;
dummy++;
}
YieldProcessorNormalizedForPreSkylakeCount(uDelay);

// exponential backoff: wait a factor longer in the next iteration
uDelay *= g_SpinConstants.uBackoffFactor;
}
Expand Down Expand Up @@ -260,24 +246,10 @@ void ReaderWriterLock::AcquireWriteLock()
{
break;
}

// Delay by approximately 2*i clock cycles (Pentium III).
// This is brittle code - future processors may of course execute this
// faster or slower, and future code generators may eliminate the loop altogether.
// The precise value of the delay is not critical, however, and I can't think
// of a better way that isn't machine-dependent - petersol.
int sum = 0;
for (int delayCount = uDelay; --delayCount; )
{
sum += delayCount;
PalYieldProcessor(); // indicate to the processor that we are spining
}
if (sum == 0)
{
// never executed, just to fool the compiler into thinking sum is live here,
// so that it won't optimize away the loop.
static char dummy;
dummy++;
}
YieldProcessorNormalizedForPreSkylakeCount(uDelay);

// exponential backoff: wait a factor longer in the next iteration
uDelay *= g_SpinConstants.uBackoffFactor;
}
Expand Down
3 changes: 3 additions & 0 deletions src/Native/Runtime/startup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "RhConfig.h"
#include "stressLog.h"
#include "RestrictedCallouts.h"
#include "yieldprocessornormalized.h"

#ifndef DACCESS_COMPILE

Expand Down Expand Up @@ -114,6 +115,8 @@ static bool InitDLL(HANDLE hPalInstance)
if (!g_ThunkPoolLock.InitNoThrow(CrstType::CrstCastCache))
return false;

InitializeYieldProcessorNormalizedCrst();

return true;
}

Expand Down
5 changes: 3 additions & 2 deletions src/Native/Runtime/threadstore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "threadstore.inl"
#include "RuntimeInstance.h"
#include "TargetPtrs.h"
#include "yieldprocessornormalized.h"

#include "slist.inl"
#include "GCMemoryHelpers.h"
Expand Down Expand Up @@ -246,6 +247,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent, bool fireDebugEvent)
PalFlushProcessWriteBuffers();

bool keepWaiting;
YieldProcessorNormalizationInfo normalizationInfo;
do
{
keepWaiting = false;
Expand Down Expand Up @@ -283,8 +285,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent, bool fireDebugEvent)
// too long (we probably don't need a 15ms wait here). Instead, we'll just burn some
// cycles.
// @TODO: need tuning for spin
for (int i = 0; i < 10000; i++)
PalYieldProcessor();
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 10000);
}
}

Expand Down
121 changes: 121 additions & 0 deletions src/Native/Runtime/yieldprocessornormalized.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#include "common.h"
#include "gcenv.h"
#include "gcheaputilities.h"
#include "CommonTypes.h"
#include "CommonMacros.h"
#include "daccess.h"
#include "DebugMacrosExt.h"
#include "PalRedhawkCommon.h"
#include "PalRedhawk.h"
#include "rhassert.h"
#include "slist.h"
#include "Volatile.h"
#include "yieldprocessornormalized.h"

// Alias used by the QPC math below. NOTE(review): defined as the signed
// int64_t rather than an unsigned 64-bit type as Win32's ULONGLONG is —
// harmless for the positive tick values used here, but confirm intended.
#define ULONGLONG int64_t

// Set once the lazy measurement has run; read outside the lock, hence Volatile.
static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
// Serializes the one-time measurement in InitializeYieldProcessorNormalized().
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

// One-time setup of the lock guarding the lazy measurement in
// InitializeYieldProcessorNormalized(). Called during runtime startup (InitDLL),
// before any thread can reach EnsureYieldProcessorNormalizedInitialized().
void InitializeYieldProcessorNormalizedCrst()
{
    WRAPPER_NO_CONTRACT;
    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

LARGE_INTEGER li;
if (!PalQueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}
ULONGLONG ticksPerSecond = li.QuadPart;

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
PalQueryPerformanceCounter(&li);
ULONGLONG startTicks = li.QuadPart;
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

PalQueryPerformanceCounter(&li);
ULONGLONG nowTicks = li.QuadPart;
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

// Lazily runs the one-time yield-processor measurement. Cheap once
// initialized: a single volatile load and no lock acquisition.
void EnsureYieldProcessorNormalizedInitialized()
{
    WRAPPER_NO_CONTRACT;

    if (s_isYieldProcessorNormalizedInitialized)
    {
        return;
    }

    InitializeYieldProcessorNormalized();
}
Loading