Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Move initialization of YieldProcessorNormalized to the finalizer thread #14058

Merged
merged 2 commits into from
Sep 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/mscorlib/src/Internal/Runtime/Augments/RuntimeThread.cs
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ internal static int OptimalMaxSpinWaitsPerSpinIteration
}

// This is done lazily because the first call to the function below in the process triggers a measurement that
// takes a nontrivial amount of time. See Thread::InitializeYieldProcessorNormalized(), which describes and
// calculates this value.
// takes a nontrivial amount of time if the measurement has not already been done in the background.
// See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
return s_optimalMaxSpinWaitsPerSpinIteration;
Expand Down
14 changes: 8 additions & 6 deletions src/vm/comsynchronizable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1632,8 +1632,9 @@ INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()

BEGIN_QCALL;

Thread::EnsureYieldProcessorNormalizedInitialized();
optimalMaxNormalizedYieldsPerSpinIteration = Thread::GetOptimalMaxNormalizedYieldsPerSpinIteration();
// RuntimeThread calls this function only once lazily and caches the result, so ensure initialization
EnsureYieldProcessorNormalizedInitialized();
optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;

END_QCALL;

Expand All @@ -1655,10 +1656,11 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
// spinning for less than that number of cycles, then switching to preemptive
// mode won't help a GC start any faster.
//
if (iterations <= 100000 && Thread::IsYieldProcessorNormalizedInitialized())
if (iterations <= 100000)
{
YieldProcessorNormalizationInfo normalizationInfo;
for (int i = 0; i < iterations; i++)
Thread::YieldProcessorNormalized();
YieldProcessorNormalized(normalizationInfo);
return;
}

Expand All @@ -1668,9 +1670,9 @@ FCIMPL1(void, ThreadNative::SpinWait, int iterations)
HELPER_METHOD_FRAME_BEGIN_NOPOLL();
GCX_PREEMP();

Thread::EnsureYieldProcessorNormalizedInitialized();
YieldProcessorNormalizationInfo normalizationInfo;
for (int i = 0; i < iterations; i++)
Thread::YieldProcessorNormalized();
YieldProcessorNormalized(normalizationInfo);

HELPER_METHOD_FRAME_END();
}
Expand Down
2 changes: 2 additions & 0 deletions src/vm/finalizerthread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,8 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args)
#endif
GetFinalizerThread()->SetBackground(TRUE);

EnsureYieldProcessorNormalizedInitialized();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I understanding the change correctly that until the finalizer thread gets to this point, spinning will be done using a default value, and then once this runs, that value will be updated appropriately based on the system? I think that makes sense, just wanted to confirm.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that's right


#ifdef FEATURE_PROFAPI_ATTACH_DETACH
// Add the Profiler Attach Event to the array of event handles that the
// finalizer thread waits on. If the process is not enabled for profiler
Expand Down
47 changes: 33 additions & 14 deletions src/vm/threads.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ SPTR_IMPL(ThreadStore, ThreadStore, s_pThreadStore);
CONTEXT *ThreadStore::s_pOSContext = NULL;
CLREvent *ThreadStore::s_pWaitForStackCrawlEvent;

static CrstStatic s_initializeYieldProcessorNormalizedCrst;

#ifndef DACCESS_COMPILE


Expand Down Expand Up @@ -1363,7 +1365,7 @@ void InitThreadManager()
}
CONTRACTL_END;

Thread::s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);

// All patched helpers should fit into one page.
// If you hit this assert on retail build, there is most likely problem with BBT script.
Expand Down Expand Up @@ -11747,25 +11749,29 @@ ULONGLONG Thread::QueryThreadProcessorUsage()
}
#endif // FEATURE_APPDOMAIN_RESOURCE_MONITORING

CrstStatic Thread::s_initializeYieldProcessorNormalizedCrst;
int Thread::s_yieldsPerNormalizedYield = 0;
int Thread::s_optimalMaxNormalizedYieldsPerSpinIteration = 0;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// YieldProcessorNormalized

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this would be 9 for pre-Skylake
int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;

void Thread::InitializeYieldProcessorNormalized()
void InitializeYieldProcessorNormalized()
{
LIMITED_METHOD_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (IsYieldProcessorNormalizedInitialized())
if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int DefaultYieldsPerNormalizedYield = 1; // defaults are for when no measurement is done
const int DefaultOptimalMaxNormalizedYieldsPerSpinIteration = 64; // tuned for pre-Skylake processors, for post-Skylake it should be 7
const int MeasureDurationMs = 10;
const int MaxYieldsPerNormalizedYield = 10; // measured typically 8-9 on pre-Skylake
const int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
Expand All @@ -11776,8 +11782,7 @@ void Thread::InitializeYieldProcessorNormalized()
if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_yieldsPerNormalizedYield = DefaultYieldsPerNormalizedYield;
s_optimalMaxNormalizedYieldsPerSpinIteration = DefaultOptimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;
return;
}
ULONGLONG ticksPerSecond = li.QuadPart;
Expand All @@ -11790,11 +11795,14 @@ void Thread::InitializeYieldProcessorNormalized()
ULONGLONG elapsedTicks;
do
{
for (int i = 0; i < 10; ++i)
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
YieldProcessor();
}
yieldCount += 10;
yieldCount += 1000;

QueryPerformanceCounter(&li);
ULONGLONG nowTicks = li.QuadPart;
Expand Down Expand Up @@ -11827,6 +11835,17 @@ void Thread::InitializeYieldProcessorNormalized()
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
s_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;
}

// Lazily runs the one-time yield-normalization measurement if it has not happened yet
// (e.g. when the finalizer thread has not reached its eager initialization call).
// Cheap fast path: a single read of the already-initialized flag.
void EnsureYieldProcessorNormalizedInitialized()
{
    WRAPPER_NO_CONTRACT;

    if (s_isYieldProcessorNormalizedInitialized)
    {
        return;
    }

    InitializeYieldProcessorNormalized();
}
137 changes: 72 additions & 65 deletions src/vm/threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -5362,71 +5362,6 @@ class Thread: public IUnknown
m_HijackReturnKind = returnKind;
}
#endif // FEATURE_HIJACK

private:
static CrstStatic s_initializeYieldProcessorNormalizedCrst;
static int s_yieldsPerNormalizedYield;
static int s_optimalMaxNormalizedYieldsPerSpinIteration;

private:
static void InitializeYieldProcessorNormalized();

public:
static bool IsYieldProcessorNormalizedInitialized()
{
LIMITED_METHOD_CONTRACT;
return s_yieldsPerNormalizedYield != 0 && s_optimalMaxNormalizedYieldsPerSpinIteration != 0;
}

public:
static void EnsureYieldProcessorNormalizedInitialized()
{
LIMITED_METHOD_CONTRACT;

if (!IsYieldProcessorNormalizedInitialized())
{
InitializeYieldProcessorNormalized();
}
}

public:
static int GetOptimalMaxNormalizedYieldsPerSpinIteration()
{
WRAPPER_NO_CONTRACT;
_ASSERTE(IsYieldProcessorNormalizedInitialized());

return s_optimalMaxNormalizedYieldsPerSpinIteration;
}

public:
static void YieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;
_ASSERTE(IsYieldProcessorNormalizedInitialized());

int n = s_yieldsPerNormalizedYield;
while (--n >= 0)
{
YieldProcessor();
}
}

static void YieldProcessorNormalizedWithBackOff(unsigned int spinIteration)
{
WRAPPER_NO_CONTRACT;
_ASSERTE(IsYieldProcessorNormalizedInitialized());

int n = s_optimalMaxNormalizedYieldsPerSpinIteration;
if (spinIteration <= 30 && (1 << spinIteration) < n)
{
n = 1 << spinIteration;
}
n *= s_yieldsPerNormalizedYield;
while (--n >= 0)
{
YieldProcessor();
}
}
};

// End of class Thread
Expand Down Expand Up @@ -7573,4 +7508,76 @@ class ThreadStateNCStackHolder

BOOL Debug_IsLockedViaThreadSuspension();

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// YieldProcessorNormalized

extern int g_yieldsPerNormalizedYield;
extern int g_optimalMaxNormalizedYieldsPerSpinIteration;

void InitializeYieldProcessorNormalized();
void EnsureYieldProcessorNormalizedInitialized();

// Snapshot of the current yield-normalization factor, taken once at construction so that
// a loop issuing many normalized yields does not reread the global on every iteration.
class YieldProcessorNormalizationInfo
{
private:
    // Number of raw YieldProcessor() calls that make up one normalized yield,
    // captured from the global at construction time.
    int yieldsPerNormalizedYield = g_yieldsPerNormalizedYield;

public:
    YieldProcessorNormalizationInfo() = default;

    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
};

// Issues one normalized yield: repeats the raw YieldProcessor() pause enough times
// (per the snapshot in normalizationInfo) to approximate a uniform delay across processors.
FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
{
    LIMITED_METHOD_CONTRACT;

    const int yieldCount = normalizationInfo.yieldsPerNormalizedYield;
    for (int i = 0; i < yieldCount; ++i)
    {
        YieldProcessor();
    }
}

// Snapshot of the normalization values needed by the exponential-back-off yield helper.
// Captured once at construction so hot spin loops avoid repeated global reads, and the
// product of the two factors is precomputed for the capped (maximum back-off) case.
class YieldProcessorWithBackOffNormalizationInfo
{
private:
    // Raw YieldProcessor() calls per normalized yield, from the global at construction.
    int yieldsPerNormalizedYield = g_yieldsPerNormalizedYield;
    // Cap on normalized yields for a single spin iteration, from the global at construction.
    int optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;
    // Precomputed raw-yield count for a fully backed-off spin iteration.
    int optimalMaxYieldsPerSpinIteration = yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration;

public:
    YieldProcessorWithBackOffNormalizationInfo() = default;

    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorWithBackOffNormalizationInfo &, unsigned int);
};

// Issues an exponentially growing number of normalized yields for the given spin
// iteration: 2^spinIteration normalized yields until that would meet or exceed the
// configured per-iteration cap (or spinIteration exceeds 30, where the shift would
// overflow), after which the precomputed capped count is used.
FORCEINLINE void YieldProcessorWithBackOffNormalized(
    const YieldProcessorWithBackOffNormalizationInfo &normalizationInfo,
    unsigned int spinIteration)
{
    LIMITED_METHOD_CONTRACT;

    // The shift is only evaluated when spinIteration <= 30, keeping 1 << spinIteration well-defined.
    const bool belowCap =
        spinIteration <= 30 && (1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration;
    const int yieldCount = belowCap
        ? (1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield
        : normalizationInfo.optimalMaxYieldsPerSpinIteration;
    for (int i = 0; i < yieldCount; ++i)
    {
        YieldProcessor();
    }
}

#endif //__threads_h__