Linux tested

ad8e · Feb 13, 2023 · 4816950 · 4816950
1 parent 44685c6
commit 4816950
Show file tree

Hide file tree

Showing 21 changed files with 7,182 additions and 1 deletion.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,3 @@
+Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/README.md b/README.md
@@ -1 +1,15 @@
-# vsync_blurbusters
+Mark Rejhon wants to create a cross-platform API for finding the vsync timings. This is the first step along that path. The demo is in `render_vsync_demo.cpp`.
+
+The platform APIs are in `platform_vsync_linux.cpp` and `platform_vsync_windows.cpp`.
+
+`vsync.cpp` turns a stream of timepoints from a wakeup thread into a period and phase pair, and is seriously complex. `vsync_with_scanline.cpp` turns a stream of accurate scanlines into a period and phase pair, and is simple linear regression. If your platform gives you the vsync period exactly, you don't need either of these.
+
+The other files are helper files which you can ignore.
+
+It works on Linux, using OML to get the vsync timepoint.
+
+It works on Windows, using either scanlines or waiting. It's not clear which is preferred. On Intel GPUs, scanlines are better. On Nvidia, scanlines may have problems.
+
+Guide:
+1. Install GLFW. On Linux, that's `sudo apt install libglfw3-dev`
+2. Compile. On Linux, that's `g++ render_vsync_demo.cpp -std=c++20 -lGL -lglfw -lXrandr -Ij -lpthread -O2`
diff --git a/d3dkmthk_fake.h b/d3dkmthk_fake.h
@@ -0,0 +1,46 @@
+#pragma once
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+//copy pasted from the original source.
+//https://github.com/tpn/winsdk-10/blob/master/Include/10.0.16299.0/km/d3dkmthk.h
+//https://github.com/tpn/winsdk-10/blob/master/Include/10.0.16299.0/shared/d3dukmdt.h
+//https://github.com/tpn/winsdk-10/blob/master/Include/10.0.16299.0/shared/d3dkmdt.h
+
+#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
+
+typedef UINT D3DKMT_HANDLE;
+typedef UINT D3DDDI_VIDEO_PRESENT_SOURCE_ID;
+typedef long NTSTATUS;
+
+// Descriptor of a video present source: its ID plus one reserved DWORD.
+// NOTE(review): none of the prototypes declared in this header take this struct - presumably it was carried along with the copied SDK excerpt; confirm before removing.
+typedef struct _D3DKMDT_VIDEO_PRESENT_SOURCE {
+	// Unique ID used to reference the respective video present source by the miniport and the OS.
+	D3DDDI_VIDEO_PRESENT_SOURCE_ID Id;
+
+	// Other video present source descriptor properties go here.
+	DWORD dwReserved;
+} D3DKMDT_VIDEO_PRESENT_SOURCE;
+
+// Argument block for D3DKMTWaitForVerticalBlankEvent (prototype at the end of this header).
+// All three fields are inputs; hDevice is marked optional.
+// The field order and types are copied from the SDK header and must not be changed.
+typedef struct _D3DKMT_WAITFORVERTICALBLANKEVENT {
+	D3DKMT_HANDLE hAdapter; // in: adapter handle
+	D3DKMT_HANDLE hDevice; // in: device handle [Optional]
+	D3DDDI_VIDEO_PRESENT_SOURCE_ID VidPnSourceId; // in: adapter's VidPN Source ID
+} D3DKMT_WAITFORVERTICALBLANKEVENT;
+
+// Argument block for D3DKMTGetScanLine (prototype at the end of this header).
+// The caller fills hAdapter and VidPnSourceId; the call writes InVerticalBlank and ScanLine.
+// The field order and types are copied from the SDK header and must not be changed.
+typedef struct _D3DKMT_GETSCANLINE {
+	D3DKMT_HANDLE hAdapter; // in: Adapter handle
+	D3DDDI_VIDEO_PRESENT_SOURCE_ID VidPnSourceId; // in: Adapter's VidPN Source ID
+	BOOLEAN InVerticalBlank; // out: Within vertical blank
+	UINT ScanLine; // out: Current scan line
+} D3DKMT_GETSCANLINE;
+
+// Argument block for D3DKMTOpenAdapterFromHdc (prototype at the end of this header).
+// The caller fills hDc; the call writes hAdapter, AdapterLuid and VidPnSourceId.
+// The layout follows the SDK header linked at the top of this file, not the MSDN page:
+//msdn is completely wrong about this
+typedef struct _D3DKMT_OPENADAPTERFROMHDC {
+	HDC hDc; // in:  DC that maps to a single display
+	D3DKMT_HANDLE hAdapter; // out: adapter handle
+	LUID AdapterLuid; // out: adapter LUID
+	D3DDDI_VIDEO_PRESENT_SOURCE_ID VidPnSourceId; // out: VidPN source ID for that particular display
+} D3DKMT_OPENADAPTERFROMHDC;
+
+// Prototypes only: the definitions are not in this header and have to be resolved at link time.
+// NOTE(review): presumably these are exported by gdi32 - confirm which import library the Windows build links against.
+// Each call takes a pointer to the matching argument struct above and returns an NTSTATUS (compare against STATUS_SUCCESS).
+EXTERN_C _Check_return_ NTSTATUS APIENTRY D3DKMTOpenAdapterFromHdc(_Inout_ D3DKMT_OPENADAPTERFROMHDC*);
+EXTERN_C _Check_return_ NTSTATUS APIENTRY D3DKMTWaitForVerticalBlankEvent(_In_ CONST D3DKMT_WAITFORVERTICALBLANKEVENT*);
+EXTERN_C _Check_return_ NTSTATUS APIENTRY D3DKMTGetScanLine(_Inout_ D3DKMT_GETSCANLINE*);
diff --git a/div_floor.h b/div_floor.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <cstdint>
+
+//future: https://stackoverflow.com/a/30824434 is probably better.
+
+//division rounding to positive infinity, for unsigned operands. b must be nonzero.
+//computed as quotient + (remainder != 0) instead of (a + b - 1) / b, because a + b - 1 wraps around for large a and then yields a far too small result. https://stackoverflow.com/a/2745763
+//inline (like positive_modulo in this header) so the header can be included from more than one translation unit.
+inline uint64_t div_ceil(uint64_t a, uint64_t b) {
+	return a / b + (a % b != 0);
+}
+
+//division rounding to positive infinity. a may be negative; b must be nonzero.
+//all arithmetic is done on unsigned magnitudes, so a divisor above INT64_MAX is handled correctly (copying it into an int64_t would turn it negative and corrupt the result).
+//inline (like positive_modulo in this header) so the header can be included from more than one translation unit.
+inline int64_t div_ceil(int64_t a, uint64_t b) {
+	if (a > 0) {
+		//positive quotient: truncation rounds down, so add 1 whenever there is a remainder. the result never exceeds a, so it fits in int64_t.
+		const uint64_t a_ = uint64_t(a);
+		return int64_t(a_ / b + (a_ % b != 0));
+	}
+	//a <= 0: the ceiling is the negated, truncated quotient of |a| / b. https://stackoverflow.com/a/30824434
+	//|a| is formed by unsigned negation so that INT64_MIN does not overflow; the final negation is unsigned for the same reason.
+	const uint64_t magnitude = uint64_t(0) - uint64_t(a);
+	return int64_t(uint64_t(0) - magnitude / b);
+}
+
+//division rounding to negative infinity. can handle negative first value; the second value is unsigned and must be nonzero.
+//inline (like positive_modulo in this header) so the header can be included from more than one translation unit.
+inline int64_t div_floor(int64_t a, uint64_t b) {
+	//for negative a, floor(a / b) == -1 - floor((-1 - a) / b). -1 - a is nonnegative and cannot overflow, even for INT64_MIN.
+	//the conversions are spelled out so the division visibly happens in unsigned arithmetic; every quotient is at most INT64_MAX, so converting back is lossless.
+	if (a < 0) return -1 - int64_t(uint64_t(-1 - a) / b);
+	return int64_t(uint64_t(a) / b);
+}
+
+//division rounding to negative infinity, unsigned overload: plain integer division already rounds down here. b must be nonzero.
+//inline (like positive_modulo in this header) so the header can be included from more than one translation unit.
+inline uint64_t div_floor(uint64_t a, uint64_t b) {
+	return a / b;
+}
+
+//division rounding to negative infinity, int / unsigned overload. a may be negative; b must be nonzero.
+//inline (like positive_modulo in this header) so the header can be included from more than one translation unit.
+inline int div_floor(int a, unsigned b) {
+	//for negative a, floor(a / b) == -1 - floor((-1 - a) / b). -1 - a is nonnegative and cannot overflow, even for INT_MIN.
+	//the conversions are spelled out so the division visibly happens in unsigned arithmetic; every quotient fits back into int.
+	if (a < 0) return -1 - static_cast<int>(static_cast<unsigned>(-1 - a) / b);
+	return static_cast<int>(static_cast<unsigned>(a) / b);
+}
+
+//division rounding to negative infinity, unsigned overload: plain integer division already rounds down here. b must be nonzero.
+//inline (like positive_modulo in this header) so the header can be included from more than one translation unit.
+inline unsigned div_floor(unsigned a, unsigned b) {
+	return a / b;
+}
+
+//remainder of a divided by b that is never negative: a minus the largest multiple of b that does not exceed a.
+//the subtraction is carried out in uint64_t, exactly as the implicit conversions of the one-line form would do.
+inline uint64_t positive_modulo(int64_t a, uint64_t b) {
+	const int64_t floored_quotient = div_floor(a, b);
+	const uint64_t multiple_below = static_cast<uint64_t>(floored_quotient) * b;
+	return static_cast<uint64_t>(a) - multiple_below;
+}
diff --git a/frame_time_measurement.cpp b/frame_time_measurement.cpp
@@ -0,0 +1,100 @@
+#pragma once
+#include "helper.h"
+#include "renderer.h"
+
+float GPU_swap_delay_in_ms = 0.69; //tailored for Intel HD 4000. no idea about other GPUs. 0.7 is too long for this GPU
+//there is no consistency.
+//new graphics card: Intel Iris Xe Graphics G7 80EUs
+//the delay is determined by the frequency of the GPU, and varies by 40% of the frame. you can see the GPU's frequency step around as its frequency changes.
+//to check this, I installed tlp, and changed these configurations, which forced the GPU to its lowest frequency, and stabilized the tearline at 2/5 down the screen:
+//INTEL_GPU_MAX_FREQ_ON_AC=100
+//INTEL_GPU_BOOST_FREQ_ON_AC=100
+//INTEL_GPU_MIN_FREQ_ON_AC=100
+
+float render_overrun_in_ms = 0.8; //you wait, then render, then wait, then swap. this specifies how much extra time the render sometimes takes, compared to frame_time.
+//there exists a constant term, at least. because on my Intel HD 4000, rendering nearly nothing still results in overruns.
+
+//the GPU timestamps are async: you might not be able to retrieve them until a long time later. this causes index_lagging_GPU_time_to_retrieve
+//OpenGL Insights reports that availability might be 5 frames late. P500: "Depending on the frame rate and CPU overhead, the number of rendering frames the GPU is running behind the CPU can be observed to be as high as five frames, depending on the GPUs performance and driver settings"
+//so we'll use a 16 frame circular buffer. 8 frames double-buffered. 2 measurements per frame.
+//we replace old entries without checking if they've really been finished; we just assume everything is finished after 5 frames. 8 > 5 provides buffer room.
+//actually, it can be even longer than that. in the sine wave animation, when frames are being spammed out with no regards to vsync, I see a few OpenGL errors with glGetQueryObjectiv. this is with an 8 frame buffer. so I doubled them again.
+constexpr uint frame_time_buffer_size = 32;
+constexpr uint GPU_timestamp_buffer_size = frame_time_buffer_size / 2;
+
+//we don't use a circular buffer because we are grabbing memory from strange locations, and the notation doesn't fit very well
+array<GLuint, frame_time_buffer_size> query_double_buffer = {}; //initialize to 0: glDeleteQueries ignores zeros, and in the first few loops, we delete queries that haven't been created yet.
+uint index_next_query_available = 0;
+uint index_lagging_GPU_time_to_retrieve = 0; //should be less than index_next_query_available
+array<GLuint64, frame_time_buffer_size / 2> GPU_start_times;
+double frame_time = 0; //in seconds.
+double single_frame_time = 0; //the most recently stored frame time, in seconds
+
+const uint generate_queries_per_batch = 2; //power of 2, <= frame_time_buffer_size / 2. (higher batches will decrease the effective buffer size and cause some frames to take longer, in a spiky fashion. higher throughput, more jitter)
+
+//folds the newest measurement into the running estimate.
+//input: the global single_frame_time (seconds). output: the global frame_time (seconds). neither is passed explicitly.
+void update_frame_time() {
+	//goal: if the frame time jumps up, follow quickly; if it drops, follow slowly. missing the vsync is very bad, but being early is fine.
+	//rejected idea: run a linear filter over the squared times. squaring forces every filter coefficient to be positive (a linear filter can otherwise produce negative values, and the square root of those makes no sense), which rules out the phase-locking filter of the audio thread.
+	//chosen instead: an exponentially weighted blend, like L2 but with exponential weight. the share given to the new sample grows with the sample's own duration, and more recent information counts for more.
+
+	const double decay_rate_Hz = 5;
+	//4 ms are added because single_frame_time is not the time between frames - it's the time to render a single frame. it can be tiny, and would then be given no weight.
+	const double weighted_duration = single_frame_time + 0.004;
+	const double old_share = std::exp(-weighted_duration * decay_rate_Hz);
+	const double new_share = 1 - old_share;
+	frame_time = frame_time * old_share + single_frame_time * new_share;
+	//outc("frame time", frame_time);
+}
+
+//creates a batch of generate_queries_per_batch OpenGL query objects and stores their names in query_double_buffer, starting at slot index_next_query_available % frame_time_buffer_size.
+//this function does not advance index_next_query_available; that happens outside this function.
+//NOTE(review): if the starting slot can ever be the last element of the array, a batch of 2 writes one name past the end. presumably the index is always even here - confirm against the caller.
+void generate_OpenGL_Queries() {
+	//glGenQueries() is effectively instant CPU-wise, but it's heavy on the GPU
+	//performance: my Intel GPU swap line quantum is 16 px, on a 1920/1080 screen. so each quantum is 1/67.5 of the screen time, 0.2 ms.
+	//then, generating 64 queries takes 3 quanta. thus, generating multiple queries simultaneously is not free. I don't know what the constant factor is (for generating 1 query), but I think max latency is a bigger problem than throughput as long as the frame time is less than the screen refresh rate. so we'll generate as few queries per batch as possible
+	//generating queries in batches causes gaps in the sine wave animation. changing the batch size causes the gaps to change.
+	//NOTE(review): with generate_queries_per_batch == 2 (its current value) the second half of this condition is always true, so a batch is generated on every call regardless of the index. confirm that this is intended.
+	if (index_next_query_available % generate_queries_per_batch == 0 || generate_queries_per_batch == 2) {
+		//glDeleteQueries(generate_queries_per_batch, &query_double_buffer[index_next_query_available % frame_time_buffer_size]); //deletion is very expensive! it's making the sine wave animation jitter like crazy. but if you don't delete old queries, the GPU pauses for a while at program shutdown while it cleans up the Queries you didn't delete
+		//we can delete either here, or after they are finished retrieving. I think it's better to delete them here, so there is exactly one deletion per frame, rather than after retrieval, when there might be two deletions per frame, and sometimes zero. deleting them here does cause more Queries to be alive simultaneously
+		//if we keep 65536 Queries alive by not deleting anything and having a 65536-size buffer, it does cause longer horizontal bars over time. that means as queries accumulate, they are causing problems.
+		//that means having stale Queries causes performance issues. so we'll delete them as they finish, rather than here
+		glGenQueries(generate_queries_per_batch, &query_double_buffer[index_next_query_available % frame_time_buffer_size]);
+	}
+}
+
+uint64_t frames_since_discarded = 64;
+
+//processes one retrieved GPU end-of-frame timestamp.
+//GPU_finish_time: the timestamp in GPU ticks; the division by 10^9 below treats a tick as one nanosecond.
+//effects: deletes the finished query objects, sets single_frame_time (seconds), and - unless the frame is discarded as a rare outlier - folds it into frame_time through update_frame_time().
+//reads index_lagging_GPU_time_to_retrieve, GPU_start_times and system_claimed_monitor_Hz (defined outside this file section); it does not advance the index itself.
+void finished_time_retrieval(uint64_t GPU_finish_time) {
+	//the index is unsigned, so (index - 1) wraps around when the index is 0.
+	//NOTE(review): if the resulting slot is the last element of the array, deleting generate_queries_per_batch (2) names reads one element past the end. confirm the index parity the caller guarantees.
+	glDeleteQueries(generate_queries_per_batch, &query_double_buffer[(index_lagging_GPU_time_to_retrieve - 1) % frame_time_buffer_size]); //either this delete is active, or the one in generate_OpenGL_Queries() is active, but not both. check its compatriot in generate_OpenGL_Queries() for documentation
+	//two queries per frame, so halving the query index gives the frame's slot in GPU_start_times.
+	unsigned index = index_lagging_GPU_time_to_retrieve / 2;
+	//unsigned subtraction: a finish time earlier than the stored start time wraps to a huge value, which the cap below then treats as a too-long frame.
+	uint64_t time_taken_in_GPU_ticks = GPU_finish_time - GPU_start_times[index % GPU_timestamp_buffer_size];
+	single_frame_time = time_taken_in_GPU_ticks / std::pow(10, 9);
+
+	//note: if the screen is minimized, its timing is still counted! you get a super long frame time. which then can fuck up the frame time estimates.
+	//there's no way to distinguish between super-slow frames (for example, while waiting for a Fourier transform calculation) vs just being minimized. we can't throw away the information even though it looks bad, because it might not be bad, and then it would be important.
+	//thus, garbage goes into the lowpass estimation algorithm. which is not designed to handle it.
+	auto cap_frame_time = 2.0 / system_claimed_monitor_Hz; //2 frames max. we can impose this low cap because all high frame times cause equivalent behavior - the renderer simply stops trying to measure things
+	if (single_frame_time < cap_frame_time) {
+		//normal frame: count it towards the quota that permits the next discard.
+		++frames_since_discarded;
+	}
+	else {
+		if (frames_since_discarded >= 64) {
+			//first too-long frame after at least 64 normal ones: drop it entirely.
+			frames_since_discarded = 0;
+			return; //don't change the frame_time. this is an exceptional frame. we can do this only once in a while.
+		}
+		else {
+			//too-long frames are arriving frequently: keep the sample, but clamp it to the cap.
+			frames_since_discarded = 0;
+			single_frame_time = cap_frame_time;
+		}
+	}
+	//cap. this minimizes distortion; if frame times are so long, the algorithm isn't doing anything productive anyway.
+	//when a too-long frame appears, the estimated frame time leaps. this causes the render loop to think it will take too long to sync to the vblank. hence it simply spam-renders.
+	//spam-rendering has the positive (coincidental) consequence that it feeds more times into the time-estimation mechanism, flushing away the incorrect time judgment. so it's a very self-correcting behavior.
+	//actually, we can discard high jumps, as long as it's only one in a long while.
+
+	//in the future, we might use a distribution approach. it will estimate percentiles. in our current approach, we still have to worry about variance, so it's not that great.
+	//that is, current stability might predict future stability. maybe.
+
+	update_frame_time();
+}
diff --git a/glfw include.h b/glfw include.h
@@ -0,0 +1,24 @@
+#pragma once
+#include "helper.h"
+
+#if __linux__
+#define GLFW_EXPOSE_NATIVE_X11
+#define GL_GLEXT_PROTOTYPES
+#define GLFW_INCLUDE_GLCOREARB
+#elif _WIN32
+#define LOAD_WITH_GLAD 1
+#define GLFW_INCLUDE_NONE
+#include <glad/glad.h>
+#endif
+
+#define LINUX_EPOLL 0
+#if LINUX_EPOLL
+#define GLFW_EXPOSE_NATIVE_X11
+//#define GLFW_EXPOSE_NATIVE_WAYLAND
+#include "GLFW/glfw3.h"
+#else
+#include "GLFW/glfw3.h"
+#endif
+GLFWmonitor* active_monitor; //what if there are multiple monitors active? future
+single_def GLFWwindow* window;
+//reports whether GLFW's close flag is set for the global window.
+inline bool time_to_exit() {
+	const int close_flag = glfwWindowShouldClose(window);
+	return close_flag != 0;
+}
diff --git a/helper.h b/helper.h
@@ -0,0 +1,26 @@
+#pragma once
+#include <array>
+#include <cmath>
+#include <numbers>
+#include <ranges>
+#include <span>
+#include <vector>
+
+using std::array;
+using std::pair;
+using std::span;
+using std::string_view;
+using std::tuple;
+using std::vector;
+using uint = unsigned;
+using u32 = uint32_t;
+using u64 = uint64_t;
+using int32 = int32_t;
+using int64 = int64_t;
+
+#ifndef single_def
+#define single_def
+//#define single_def inline
+#endif
+
+//lazy range of the integers 0, 1, ..., n - 1 (empty when n == 0), meant for range-for loops.
+//inline so that including this header from several translation units does not produce multiple definitions of the function.
+inline auto zero_to(unsigned n) { return std::ranges::views::iota(0u, n); }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted.

		THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.