From ab3a9f5e95901861dcb5d4b994ba786e2d33274f Mon Sep 17 00:00:00 2001 From: wutno Date: Mon, 16 Oct 2023 07:22:03 -0400 Subject: [PATCH 1/8] winapi: Use rdtsc for QueryPerformanceCounter --- lib/winapi/profiling.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index 7afe200a3..69e0c04b9 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -11,7 +11,7 @@ BOOL QueryPerformanceCounter (LARGE_INTEGER *lpPerformanceCount) { assert(lpPerformanceCount != NULL); - lpPerformanceCount->QuadPart = KeQueryPerformanceCounter(); + lpPerformanceCount->QuadPart = __rdtsc(); return TRUE; } @@ -21,4 +21,4 @@ BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) lpFrequency->QuadPart = KeQueryPerformanceFrequency(); return TRUE; -} +} \ No newline at end of file From 64f650515098d24637900f7d00e40cd46a857db6 Mon Sep 17 00:00:00 2001 From: wutno Date: Mon, 16 Oct 2023 07:24:23 -0400 Subject: [PATCH 2/8] winapi: Follow XDK with QueryPerformanceFrequency by hardcoding 733MHz --- lib/winapi/profiling.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index 69e0c04b9..bdea7cc0a 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -19,6 +19,6 @@ BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) { assert(lpFrequency != NULL); - lpFrequency->QuadPart = KeQueryPerformanceFrequency(); + lpFrequency->QuadPart = 733333333; return TRUE; } \ No newline at end of file From 3faa5759a80af810eb8fd7425a4b95ae84d77cee Mon Sep 17 00:00:00 2001 From: wutno Date: Mon, 16 Oct 2023 07:25:24 -0400 Subject: [PATCH 3/8] winapi: Allow dynamic calculation of CPU frequency for QueryPerformanceFrequency --- lib/winapi/profiling.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index bdea7cc0a..6b11ebc8a 100644 --- a/lib/winapi/profiling.c +++ 
b/lib/winapi/profiling.c @@ -3,6 +3,9 @@ // SPDX-FileCopyrightText: 2019 Stefan Schmidt #include +#ifdef USE_RDTSC_FOR_FREQ +#include +#endif #include #include #include @@ -19,6 +22,36 @@ BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) { assert(lpFrequency != NULL); +#ifdef USE_RDTSC_FOR_FREQ + #define AVG_SET 10 + ULARGE_INTEGER f_rdtsc, avg; + ULONG f_ticks = 0; + + avg.QuadPart = 0; + + for (int i = 0; i < AVG_SET; i++) { + ULARGE_INTEGER s_rdtsc; + ULONG s_ticks; + + s_rdtsc.QuadPart = __rdtsc(); + s_ticks = KeTickCount; + + s_rdtsc.QuadPart -= f_rdtsc.QuadPart; + s_rdtsc.QuadPart /= s_ticks - f_ticks; + + f_rdtsc.QuadPart = __rdtsc(); + f_ticks = KeTickCount; + + // Skip the first result as invalid + if (i) + avg.QuadPart += s_rdtsc.QuadPart; + + // If we call rdtsc too fast we'll end up with div by 0 + Sleep(10); + } + lpFrequency->QuadPart = (avg.QuadPart / (AVG_SET - 1)) * 1000; +#else lpFrequency->QuadPart = 733333333; +#endif return TRUE; } \ No newline at end of file From 235dde3438a535809814a7102d76603c84fe8566 Mon Sep 17 00:00:00 2001 From: wutno Date: Tue, 17 Oct 2023 12:28:48 -0400 Subject: [PATCH 4/8] winapi: Prime QueryPerformanceFrequency instead of calculating freq every call --- lib/winapi/profiling.c | 49 +++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index 6b11ebc8a..9df232c12 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -10,28 +10,19 @@ #include #include -BOOL QueryPerformanceCounter (LARGE_INTEGER *lpPerformanceCount) -{ - assert(lpPerformanceCount != NULL); - - lpPerformanceCount->QuadPart = __rdtsc(); - return TRUE; -} - -BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) -{ - assert(lpFrequency != NULL); - #ifdef USE_RDTSC_FOR_FREQ - #define AVG_SET 10 - ULARGE_INTEGER f_rdtsc, avg; - ULONG f_ticks = 0; +static LARGE_INTEGER frequency = {0, 0}; +static void __attribute__((constructor)) 
PrimeQueryPerformanceFrequency () +{ + #define AVG_SET 2 + ULARGE_INTEGER f_rdtsc, avg = {0, 0}, s_rdtsc; + ULONG f_ticks = 0, s_ticks = 0; - avg.QuadPart = 0; + Sleep(500); for (int i = 0; i < AVG_SET; i++) { - ULARGE_INTEGER s_rdtsc; - ULONG s_ticks; + // If we call rdtsc too fast we'll end up with div by 0 + Sleep(200); s_rdtsc.QuadPart = __rdtsc(); s_ticks = KeTickCount; @@ -45,11 +36,25 @@ BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) // Skip the first result as invalid if (i) avg.QuadPart += s_rdtsc.QuadPart; - - // If we call rdtsc too fast we'll end up with div by 0 - Sleep(10); } - lpFrequency->QuadPart = (avg.QuadPart / (AVG_SET - 1)) * 1000; + frequency.QuadPart = avg.QuadPart / (AVG_SET - 1) * 1000LL; +} +#endif + +BOOL QueryPerformanceCounter (LARGE_INTEGER *lpPerformanceCount) +{ + assert(lpPerformanceCount != NULL); + + lpPerformanceCount->QuadPart = __rdtsc(); + return TRUE; +} + +BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) +{ + assert(lpFrequency != NULL); + +#ifdef USE_RDTSC_FOR_FREQ + lpFrequency->QuadPart = frequency.QuadPart; #else lpFrequency->QuadPart = 733333333; #endif From 9a603f7c756b1d85a144e0756d503ec508e25886 Mon Sep 17 00:00:00 2001 From: wutno Date: Tue, 17 Oct 2023 18:31:10 -0400 Subject: [PATCH 5/8] winapi: Squash me --- lib/winapi/profiling.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index 9df232c12..79ef704d9 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -11,33 +11,32 @@ #include #ifdef USE_RDTSC_FOR_FREQ -static LARGE_INTEGER frequency = {0, 0}; +static LARGE_INTEGER frequency = {{0, 0}}; static void __attribute__((constructor)) PrimeQueryPerformanceFrequency () { - #define AVG_SET 2 - ULARGE_INTEGER f_rdtsc, avg = {0, 0}, s_rdtsc; + ULARGE_INTEGER f_rdtsc = {{0, 0}}, s_rdtsc = {{0, 0}}; ULONG f_ticks = 0, s_ticks = 0; - Sleep(500); + KeEnterCriticalRegion(); - for 
(int i = 0; i < AVG_SET; i++) { - // If we call rdtsc too fast we'll end up with div by 0 - Sleep(200); + // The values generated after launching aren't accurate, give it time to increment... + Sleep(700); - s_rdtsc.QuadPart = __rdtsc(); - s_ticks = KeTickCount; + f_rdtsc.QuadPart = __rdtsc(); + f_ticks = KeTickCount; - s_rdtsc.QuadPart -= f_rdtsc.QuadPart; - s_rdtsc.QuadPart /= s_ticks - f_ticks; + Sleep(200); - f_rdtsc.QuadPart = __rdtsc(); - f_ticks = KeTickCount; + s_rdtsc.QuadPart = __rdtsc(); + s_ticks = KeTickCount; - // Skip the first result as invalid - if (i) - avg.QuadPart += s_rdtsc.QuadPart; - } - frequency.QuadPart = avg.QuadPart / (AVG_SET - 1) * 1000LL; + s_rdtsc.QuadPart -= f_rdtsc.QuadPart; + s_rdtsc.QuadPart /= s_ticks - f_ticks; + + frequency.QuadPart = s_rdtsc.QuadPart; + frequency.QuadPart *= 1000LL; + + KeLeaveCriticalRegion(); } #endif @@ -59,4 +58,4 @@ BOOL QueryPerformanceFrequency (LARGE_INTEGER *lpFrequency) lpFrequency->QuadPart = 733333333; #endif return TRUE; -} \ No newline at end of file +} From 8ea50ce3ddfd8697c9f5c190df4cf7e0dae90c06 Mon Sep 17 00:00:00 2001 From: wutno Date: Wed, 18 Oct 2023 15:28:03 -0400 Subject: [PATCH 6/8] winapi: More verbose private variable naming --- lib/winapi/profiling.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index 79ef704d9..a42daf7b5 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -14,26 +14,26 @@ static LARGE_INTEGER frequency = {{0, 0}}; static void __attribute__((constructor)) PrimeQueryPerformanceFrequency () { - ULARGE_INTEGER f_rdtsc = {{0, 0}}, s_rdtsc = {{0, 0}}; - ULONG f_ticks = 0, s_ticks = 0; + ULARGE_INTEGER start_rdtsc = {{0, 0}}, end_rdtsc = {{0, 0}}; + ULONG start_ticks = 0, end_ticks = 0; KeEnterCriticalRegion(); // The values generated after launching aren't accurate, give it time to increment... 
Sleep(700); - f_rdtsc.QuadPart = __rdtsc(); - f_ticks = KeTickCount; + start_rdtsc.QuadPart = __rdtsc(); + start_ticks = KeTickCount; Sleep(200); - s_rdtsc.QuadPart = __rdtsc(); - s_ticks = KeTickCount; + end_rdtsc.QuadPart = __rdtsc(); + end_ticks = KeTickCount; - s_rdtsc.QuadPart -= f_rdtsc.QuadPart; - s_rdtsc.QuadPart /= s_ticks - f_ticks; + end_rdtsc.QuadPart -= start_rdtsc.QuadPart; + end_rdtsc.QuadPart /= end_ticks - start_ticks; - frequency.QuadPart = s_rdtsc.QuadPart; + frequency.QuadPart = end_rdtsc.QuadPart; frequency.QuadPart *= 1000LL; KeLeaveCriticalRegion(); From eb4385bb2d987bd07513bde4cc4567034963643c Mon Sep 17 00:00:00 2001 From: wutno Date: Tue, 24 Oct 2023 05:53:31 -0400 Subject: [PATCH 7/8] winapi: Move to assembly --- lib/winapi/profiling.c | 89 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 13 deletions(-) diff --git a/lib/winapi/profiling.c b/lib/winapi/profiling.c index a42daf7b5..e2b735060 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -14,29 +14,92 @@ static LARGE_INTEGER frequency = {{0, 0}}; static void __attribute__((constructor)) PrimeQueryPerformanceFrequency () { - ULARGE_INTEGER start_rdtsc = {{0, 0}}, end_rdtsc = {{0, 0}}; - ULONG start_ticks = 0, end_ticks = 0; + #define BASE_CLOCK_FLOAT 16.666667f + #define NV_PRAMDAC_PLL_COEFF *(volatile ULONG *)0xFD680500 + #define NV_PTIMER_NUM *(volatile ULONG *)0xFD009200 + #define NV_PTIMER_DEN *(volatile ULONG *)0xFD009210 + #define NV_PTIMER_COUNT 0xFD009400 + #define ASM_LOOPS 1024 * 4 + + ULARGE_INTEGER rdtsc_count_1 = {{0, 0}}, rdtsc_count_2 = {{0, 0}}; + DWORD ptimer_count_1 = 0, ptimer_count_2 = 0; + + // Precalculate NVCLK & PTIMER freq + double nv_clock = BASE_CLOCK_FLOAT * ((NV_PRAMDAC_PLL_COEFF & 0xFF00) >> 8); + nv_clock /= 1 << ((NV_PRAMDAC_PLL_COEFF & 0x70000) >> 16); + nv_clock /= NV_PRAMDAC_PLL_COEFF & 0xFF; + + double ptimer_frequency = (nv_clock / NV_PTIMER_NUM) * NV_PTIMER_DEN; KeEnterCriticalRegion(); - // The values
generated after launching aren't accurate, give it time to increment... - Sleep(700); + __asm + { + push eax + push edx + push ecx + + cli + sfence + + // Turn off caches + mov eax, cr0 + or eax, 1 << 30 // Set CD bit + mov cr0, eax + wbinvd - start_rdtsc.QuadPart = __rdtsc(); - start_ticks = KeTickCount; + // Reset PTIMER + mov eax, [NV_PTIMER_COUNT] + and eax, ~(0xFFFFFFE0) // First 5 bits are not used + mov [NV_PTIMER_COUNT], eax - Sleep(200); + rdtsc + mov rdtsc_count_1.LowPart, eax + mov rdtsc_count_1.HighPart, edx + + mov eax, [NV_PTIMER_COUNT] + mov ptimer_count_1, eax - end_rdtsc.QuadPart = __rdtsc(); - end_ticks = KeTickCount; + // Spin for a bit + mov eax, ASM_LOOPS + loop_1: + dec eax + jnz loop_1 - end_rdtsc.QuadPart -= start_rdtsc.QuadPart; - end_rdtsc.QuadPart /= end_ticks - start_ticks; + rdtsc + mov rdtsc_count_2.LowPart, eax + mov rdtsc_count_2.HighPart, edx + + mov eax, [NV_PTIMER_COUNT] + mov ptimer_count_2, eax - frequency.QuadPart = end_rdtsc.QuadPart; - frequency.QuadPart *= 1000LL; + // Without this, invaldidating the cache below will crash the system + sfence + mov eax, cr0 + and eax, ~(1 << 30) // Clear CD bit + mov cr0, eax + wbinvd + + sti + + pop ecx + pop edx + pop eax + } KeLeaveCriticalRegion(); + + double ptimer_diff = (ptimer_count_2 >> 5) - (ptimer_count_1 >> 5); + double rdtsc_diff = rdtsc_count_2.QuadPart - rdtsc_count_1.QuadPart; + + double ptimer_scale = ptimer_diff / ptimer_frequency; + double cpu_freq_float = rdtsc_diff / ptimer_scale; + + if (!cpu_freq_float) { + frequency.QuadPart = 733333333; + } else { + frequency.QuadPart = (ULONG)(cpu_freq_float * 1000 * 1000); + } } #endif From ef0e000ebd35057317e7edae8894adc6540d7a82 Mon Sep 17 00:00:00 2001 From: wutno Date: Tue, 24 Oct 2023 06:07:38 -0400 Subject: [PATCH 8/8] winapi: Less assembly, good enough(tm) --- lib/winapi/profiling.c | 65 ++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/lib/winapi/profiling.c 
b/lib/winapi/profiling.c index e2b735060..df434f2b0 100644 --- a/lib/winapi/profiling.c +++ b/lib/winapi/profiling.c @@ -15,11 +15,11 @@ static LARGE_INTEGER frequency = {{0, 0}}; static void __attribute__((constructor)) PrimeQueryPerformanceFrequency () { #define BASE_CLOCK_FLOAT 16.666667f - #define NV_PRAMDAC_PLL_COEFF *(volatile ULONG *)0xFD680500 - #define NV_PTIMER_NUM *(volatile ULONG *)0xFD009200 - #define NV_PTIMER_DEN *(volatile ULONG *)0xFD009210 - #define NV_PTIMER_COUNT 0xFD009400 - #define ASM_LOOPS 1024 * 4 + #define NV_PRAMDAC_PLL_COEFF *(volatile ULONG*)0xFD680500 + #define NV_PTIMER_NUM *(volatile ULONG*)0xFD009200 + #define NV_PTIMER_DEN *(volatile ULONG*)0xFD009210 + #define NV_PTIMER_COUNT *(volatile ULONG*)0xFD009400 + #define KE_STALL 10 ULARGE_INTEGER rdtsc_count_1 = {{0, 0}}, rdtsc_count_2 = {{0, 0}}; DWORD ptimer_count_1 = 0, ptimer_count_2 = 0; @@ -33,60 +33,39 @@ static void __attribute__((constructor)) PrimeQueryPerformanceFrequency () KeEnterCriticalRegion(); + // Turn off caches __asm { - push eax - push edx - push ecx - cli sfence - - // Turn off caches mov eax, cr0 or eax, 1 << 30 // Set CD bit mov cr0, eax wbinvd - // Reset PTIMER - mov eax, [NV_PTIMER_COUNT] - and eax, ~(0xFFFFFFE0) // First 5 bits are not used - mov [NV_PTIMER_COUNT], eax - - rdtsc - mov rdtsc_count_1.LowPart, eax - mov rdtsc_count_1.HighPart, edx - - mov eax, [NV_PTIMER_COUNT] - mov ptimer_count_1, eax - - // Spin for a bit - mov eax, ASM_LOOPS - loop_1: - dec eax - jnz loop_1 - - rdtsc - mov rdtsc_count_2.LowPart, eax - mov rdtsc_count_2.HighPart, edx - - mov eax, [NV_PTIMER_COUNT] - mov ptimer_count_2, eax - - // Without this, invaldidating the cache below will crash the system - sfence + } + + // Reset the counter + NV_PTIMER_COUNT &= ~(0xFFFFFFE0); // First 5 bits are not used + + rdtsc_count_1.QuadPart = __rdtsc(); + ptimer_count_1 = NV_PTIMER_COUNT; + + KeStallExecutionProcessor(KE_STALL); + rdtsc_count_2.QuadPart = __rdtsc(); + ptimer_count_2 = 
NV_PTIMER_COUNT; + + __asm + { + sfence mov eax, cr0 and eax, ~(1 << 30) // Clear CD bit mov cr0, eax wbinvd - sti - - pop ecx - pop edx - pop eax } + KeLeaveCriticalRegion(); double ptimer_diff = (ptimer_count_2 >> 5) - (ptimer_count_1 >> 5);