-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearn_cuda_chap_1_4.cu
537 lines (482 loc) · 23.2 KB
/
learn_cuda_chap_1_4.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
/*
* This program is about rendering Julia Set in real time, which combines the technologies of parallel computing
* and interoperation between cuda and opengl. The final result is pretty fancy
*
* w,a,s,d or mouse for movement
* q,e or scroll for scaling
* z,x, c,v for tuning the complex number C(Z=Z^2+C)
*/
#include "glad/glad.h"
#include "GLFW/glfw3.h"
#include <iostream>
#include "includes/book.h"
#include <chrono>
#include "Shader.h"
#include <cuda_gl_interop.h>
int chapter1();
int chapter2_vector_addition();
int chapter2_julia_set();
float scale = 1.5f; float shift_x = 0.0f; float shift_y = 0.0f; float delta_time = 0.0f;
bool first_mouse = true; float last_x; float last_y; float last_frame = 0.0f; bool need_redraw = true;
const int DIM = 2048;
void process_input(GLFWwindow* window);
void framebuffer_size_callback(GLFWwindow* window, int width, int height);
void scroll_callback(GLFWwindow* window, double unknown_offset, double scroll_offset);
void mouse_callback(GLFWwindow* window, double width_pos, double height_pos);
struct complex_number {
float r;
float i;
// supports both host init and device init
__host__ __device__ complex_number(float r_, float i_): r(r_), i(i_) {}
// [[nodiscard]] means that if the return value is not used, there will be a compiler error
[[nodiscard]]__device__ float magnitude() const { return r * r + i * i; }
__device__ complex_number operator*(const complex_number &rhs) const {
return {r * rhs.r - i * rhs.i, r * rhs.i + i * rhs.r};
}
__device__ complex_number operator+(const complex_number &rhs) const {
return {r + rhs.r, i + rhs.i};
}
};
complex_number C(-0.8f, 0.156f);
__device__ float julia(const int x, const int y, const int DIM, const complex_number C,
float scale_dev, float shift_dev_x, float shift_dev_y) {
// we want it to scale at the center of (shift_dev_x, shift_dev_y)
float jx = scale_dev * (float)(DIM / 2 - x + shift_dev_x / scale_dev);
float jy = scale_dev * (float)(DIM / 2 - y + shift_dev_y / scale_dev);
jx /= DIM / 2;
jy /= DIM / 2;
complex_number Z(jx, jy);
// max iter = 200 (hardcoded)
for (int i = 0; i != 200; ++i) {
Z = Z * Z + C;
if (Z.magnitude() > 4.0f) {
return (float)i;
}
}
return 0;
}
__global__ void add_integer(int a, int b, int* c) {
*c = a + b;
}
__global__ void add_vector(const int* a, const int* b, int* c, const int length) {
int tid = blockIdx.x;
if (tid < length) {
c[tid] = a[tid] + b[tid];
}
}
__global__ void compute_julia(unsigned char* ptr, complex_number C,
float scale_dev, float shift_dev_x, float shift_dev_y) {
int i = blockIdx.x;
int j = blockIdx.y;
int offset = i + j * gridDim.x;
int julia_value = julia(i, j, gridDim.x, C, scale_dev, shift_dev_x, shift_dev_y);
if (julia_value > 50) {
julia_value = 50;
}
// 3, 4, 5: silver
ptr[offset * 4 + 0] = 3 * julia_value;
ptr[offset * 4 + 1] = 4 * julia_value;
ptr[offset * 4 + 2] = 5 * julia_value;
ptr[offset * 4 + 3] = 255 * julia_value > 0 ? 255 : 0;
}
__global__ void compute_julia_interop(uchar4* ptr, complex_number C, float scale_dev, float shift_dev_x, float shift_dev_y) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * gridDim.x * blockDim.x;
float julia_value = julia(x, y, gridDim.x * blockDim.x, C, scale_dev, shift_dev_x, shift_dev_y);
// if (julia_value > 50.0f) {
// julia_value = 50.0f;
// }
// 3, 4, 5: silver
ptr[offset] = make_uchar4(3.01 * julia_value, 3.07 * julia_value, 3.10 * julia_value,
255 * (julia_value > 0 ? 1 : 0));
}
int main() {
chapter2_julia_set();
return 0;
}
int chapter2_julia_set() {
const int image_size = DIM * DIM * 4;
// auto* bitmap_sysmem = (unsigned char*)malloc(image_size);
// unsigned char* dev_map;
// zero-copy memory
// HANDLE_ERROR(cudaHostAlloc((void**)&zero_copy_map, image_size, cudaHostAllocMapped));
// HANDLE_ERROR(cudaHostGetDevicePointer((void**)&dev_map, zero_copy_map, 0));
// WOW: use a glfw to visualize the result, it's so cool!!!!!!!!!!!!!!!!!!!!!!
glfwInit();
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6);
GLFWmonitor* glfWmonitor = glfwGetPrimaryMonitor();
if (glfWmonitor == nullptr) {
std::cout << "Failed to get primary monitor.." << std::endl;
glfwTerminate();
return -1;
}
const GLFWvidmode *glfWvidmode = glfwGetVideoMode(glfWmonitor);
unsigned int RENDER_WIDTH = glfWvidmode->width;
unsigned int RENDER_HEIGHT = glfWvidmode->height;
GLFWwindow* window = glfwCreateWindow(RENDER_WIDTH, RENDER_HEIGHT,
"Colors", glfWmonitor, nullptr);
if (window == nullptr) {
std::cout << "Failed to create glf windows.." << std::endl;
glfwTerminate();
return -1;
}
glfwMakeContextCurrent(window);
if (!gladLoadGLLoader((GLADloadproc) glfwGetProcAddress)) {
std::cout << "Failed to init GLAD loader.." << std::endl;
glfwTerminate();
return -1;
}
glViewport(0, 0, RENDER_WIDTH, RENDER_HEIGHT);
glfwSetFramebufferSizeCallback(window, framebuffer_size_callback);
glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_DISABLED);
glfwSetScrollCallback(window, scroll_callback);
glfwSetCursorPosCallback(window, mouse_callback);
// a rectangle in front of the camera
float vertices[] = {
// positions // texture coords
1.0f, 1.0f, 0.0f, 1.0f, 1.0f, // top right
1.0f, -1.0f, 0.0f, 1.0f, 0.0f, // bottom right
-1.0f, -1.0f, 0.0f, 0.0f, 0.0f, // bottom left
-1.0f, 1.0f, 0.0f, 0.0f, 1.0f // top left
};
unsigned int indices[] = {
0, 1, 3, // first triangle
1, 2, 3 // second triangle
};
Shader shader("./shaders/display_texture.vert", "./shaders/display_texture.frag");
shader.use();
unsigned int VBO, VAO, EBO;
// generate vertex array object
glGenVertexArrays(1, &VAO);
glBindVertexArray(VAO);
// generate vertex buffer object
glGenBuffers(1, &VBO);
// generate element buffer object
glGenBuffers(1, &EBO);
// bind the vertex array object
glBindVertexArray(VAO);
// copy vertices array into vertex buffer
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
// copy indices array into element buffer
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
// configure vertex attributes
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 5 * sizeof(float),
(void*) nullptr);
glEnableVertexAttribArray(0); // pos
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(float),
(void*) (3 * sizeof(float)));
glEnableVertexAttribArray(1); // texture coords
// texture
unsigned int texture;
glGenTextures(1, &texture);
glBindTexture(GL_TEXTURE_2D, texture);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, DIM, DIM, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr);
// 注册 OpenGL 缓冲区到 CUDA 图形资源
GLuint bufferObj;
cudaGraphicsResource *resource;
glGenBuffers(1, &bufferObj);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bufferObj);
glBufferData(GL_PIXEL_UNPACK_BUFFER, DIM * DIM * sizeof(uchar4), NULL, GL_DYNAMIC_DRAW);
HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone));
uchar4* devPtr;
size_t size;
// 映射资源
HANDLE_ERROR(cudaGraphicsMapResources(1, &resource, NULL));
HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource));
// we should immediately unmap the resource, according to the RULES
HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL));
dim3 blocks(DIM/32, DIM/32);
dim3 threads(32,32);
last_frame = (float)glfwGetTime();
// render loop
while (!glfwWindowShouldClose(window)) {
auto current_frame = (float)glfwGetTime();
delta_time = current_frame - last_frame;
last_frame = current_frame;
// std::cout << "FPS: " << 1.0f / delta_time << " Estimated PCIE bandwidth: "
// << 3.0f * (float)image_size / delta_time / 1024 / 1024 / 1024 << " GB/s" << std::endl;
// input
process_input(window);
// render
glClearColor(0.0f, 0.0f, 0.0f, 1.0f); // state-setting function
glClear(GL_COLOR_BUFFER_BIT); // state-using function
// memset the zero-copy memory
compute_julia_interop<<<blocks, threads>>>(devPtr, C,
scale, shift_x, shift_y);
HANDLE_ERROR(cudaDeviceSynchronize());
// draw the rectangle
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bufferObj);
glBindTexture(GL_TEXTURE_2D, texture);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
shader.use();
glBindVertexArray(VAO);
glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, nullptr);
// glfw: swap buffers and poll IO events (keys pressed/released, mouse moved etc.)
glfwSwapBuffers(window);
glfwPollEvents();
}
HANDLE_ERROR(cudaGraphicsUnregisterResource(resource));
// glfw: terminate, clearing all previously allocated GLFW resources.
// free(bitmap_sysmem);
glfwTerminate();
return 0;
}
/*
* This chapter is also about the basic usage of cuda, we parallelized the vector addition.
* And observed the performance.
*/
int chapter2_vector_addition() {
const unsigned int ARRAY_LENGTH = 65535;
bool identical = true;
int *a = (int*)malloc(ARRAY_LENGTH * sizeof(int));
int *b = (int*)malloc(ARRAY_LENGTH * sizeof(int));
int *c = (int*)malloc(ARRAY_LENGTH * sizeof(int));
int *cpu_temp_c = (int*)malloc(ARRAY_LENGTH * sizeof(int));
int *dev_a, *dev_b, *dev_c;
// first we assign values to a[] and b[]
for (int i = 0; i != ARRAY_LENGTH; ++i) {
a[i] = i;
b[i] = -2 * i + i % 5;
}
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i != ARRAY_LENGTH; ++i) {
cpu_temp_c[i] = a[i] + b[i];
}
auto end_cpu = std::chrono::high_resolution_clock::now();
auto start_host_to_device = std::chrono::high_resolution_clock::now();
// allocate on gpu
HANDLE_ERROR(cudaMalloc((void**)&dev_a, ARRAY_LENGTH * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b, ARRAY_LENGTH * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c, ARRAY_LENGTH * sizeof(int)));
// copy to gpu
HANDLE_ERROR(cudaMemcpy(dev_a, a, ARRAY_LENGTH * sizeof(int),
cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b, b, ARRAY_LENGTH * sizeof(int),
cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_c, c, ARRAY_LENGTH * sizeof(int),
cudaMemcpyHostToDevice));
auto end_host_to_device = std::chrono::high_resolution_clock::now();
auto start_gpu = std::chrono::high_resolution_clock::now();
add_vector<<<ARRAY_LENGTH, 1>>>
(dev_a, dev_b, dev_c, ARRAY_LENGTH);
HANDLE_ERROR(cudaDeviceSynchronize());
auto end_gpu = std::chrono::high_resolution_clock::now();
auto start_device_to_host = std::chrono::high_resolution_clock::now();
HANDLE_ERROR(cudaMemcpy(c, dev_c, ARRAY_LENGTH * sizeof(int),
cudaMemcpyDeviceToHost));
auto end_device_to_host = std::chrono::high_resolution_clock::now();
// check if the result is correct
for (int i = 0; i != ARRAY_LENGTH; ++i) {
if (c[i] != cpu_temp_c[i]) {
identical = false;
break;
}
}
if (identical) {
std::cout << "The result is correct" << std::endl;
} else {
std::cout << "The result is incorrect" << std::endl;
}
std::cout << "CPU time: " << std::chrono::duration_cast<std::chrono::microseconds>
(end_cpu - start_cpu).count() << " us" << std::endl;
std::cout << "GPU time: " << std::chrono::duration_cast<std::chrono::microseconds>
(end_gpu - start_gpu).count() << " us" << "+"
<< std::chrono::duration_cast<std::chrono::microseconds>
(end_host_to_device - start_host_to_device).count() << " us(host->device memcpy)"
<< "+" << std::chrono::duration_cast<std::chrono::microseconds>
(end_device_to_host - start_device_to_host).count() << " us(device->host memcpy)"
<< std::endl;
std::cout << "Hint: the copy time is really large, and gpu time is also not fascinating."
"We need to optimize this deeper." << std::endl;
// free memory
free(a);
free(b);
free(c);
free(cpu_temp_c);
HANDLE_ERROR(cudaFree(dev_a));
HANDLE_ERROR(cudaFree(dev_b));
HANDLE_ERROR(cudaFree(dev_c));
return 0;
}
/*
* This chapter is about the basic usage of cuda, including:
* 1. cudaGetDeviceCount
* 2. cudaGetDeviceProperties
* 3. cudaMalloc
* 4. cudaFree
* 5. cudaMemcpy
* 6. kernel function
*/
int chapter1() {
int device_count;
HANDLE_ERROR(cudaGetDeviceCount(&device_count));
std::cout << "device count: " << device_count << std::endl;
for (int i = 0; i != device_count; ++i) {
std::cout << "Fetching device properties for device " << i << std::endl;
cudaDeviceProp device_prop{};
HANDLE_ERROR(cudaGetDeviceProperties(&device_prop, i));
std::cout << "Device name: " << device_prop.name << std::endl;
std::cout << "Compute capability: " << device_prop.major << "." << device_prop.minor << std::endl;
std::cout << "Total global memory: " << device_prop.totalGlobalMem << std::endl;
std::cout << "Shared memory per block: " << device_prop.sharedMemPerBlock << std::endl;
std::cout << "Registers per block: " << device_prop.regsPerBlock << std::endl;
std::cout << "Warp size: " << device_prop.warpSize << std::endl;
std::cout << "Memory pitch: " << device_prop.memPitch << std::endl;
std::cout << "Max threads per block: " << device_prop.maxThreadsPerBlock << std::endl;
std::cout << "Max threads dimensions: " << device_prop.maxThreadsDim[0] << " " << device_prop.maxThreadsDim[1] << " " << device_prop.maxThreadsDim[2] << std::endl;
std::cout << "Max grid size: " << device_prop.maxGridSize[0] << " " << device_prop.maxGridSize[1] << " " << device_prop.maxGridSize[2] << std::endl;
std::cout << "Clock rate: " << device_prop.clockRate << std::endl;
std::cout << "Total constant memory: " << device_prop.totalConstMem << std::endl;
std::cout << "Texture alignment: " << device_prop.textureAlignment << std::endl;
std::cout << "Concurrent copy and execution: " << device_prop.deviceOverlap << std::endl;
std::cout << "Number of multiprocessors: " << device_prop.multiProcessorCount << std::endl;
std::cout << "Kernel execution timeout enabled: " << device_prop.kernelExecTimeoutEnabled << std::endl;
std::cout << "Integrated: " << device_prop.integrated << std::endl;
std::cout << "Can map host memory: " << device_prop.canMapHostMemory << std::endl;
std::cout << "Supports page-locked memory: " << device_prop.canMapHostMemory << std::endl;
std::cout << "Compute mode: " << device_prop.computeMode << std::endl;
std::cout << "Maximum 1D texture size: " << device_prop.maxTexture1D << std::endl;
std::cout << "Maximum 2D texture size: " << device_prop.maxTexture2D[0] << " " << device_prop.maxTexture2D[1] << std::endl;
std::cout << "Maximum 3D texture size: " << device_prop.maxTexture3D[0] << " " << device_prop.maxTexture3D[1] << " " << device_prop.maxTexture3D[2] << std::endl;
std::cout << "Maximum 1D layered texture dimensions: " << device_prop.maxTexture1DLayered[0] << " " << device_prop.maxTexture1DLayered[1] << std::endl;
std::cout << "Maximum 2D layered texture dimensions: " << device_prop.maxTexture2DLayered[0] << " " << device_prop.maxTexture2DLayered[1] << " " << device_prop.maxTexture2DLayered[2] << std::endl;
std::cout << "Surface alignment: " << device_prop.surfaceAlignment << std::endl;
std::cout << "Concurrent kernels: " << device_prop.concurrentKernels << std::endl;
std::cout << "ECC enabled: " << device_prop.ECCEnabled << std::endl;
std::cout << "PCI bus ID: " << device_prop.pciBusID << std::endl;
std::cout << "PCI device ID: " << device_prop.pciDeviceID << std::endl;
std::cout << "PCI domain ID: " << device_prop.pciDomainID << std::endl;
std::cout << "TCC driver: " << device_prop.tccDriver << std::endl;
std::cout << "Async engine count: " << device_prop.asyncEngineCount << std::endl;
std::cout << "Unified addressing: " << device_prop.unifiedAddressing << std::endl;
std::cout << "Memory clock rate: " << device_prop.memoryClockRate << std::endl;
std::cout << "Memory bus width: " << device_prop.memoryBusWidth << std::endl;
std::cout << "L2 cache size: " << device_prop.l2CacheSize << std::endl;
std::cout << "Max threads per multiprocessor: " << device_prop.maxThreadsPerMultiProcessor << std::endl;
std::cout << "Stream priorities: " << device_prop.streamPrioritiesSupported << std::endl;
std::cout << "Global L1 cache supported: " << device_prop.globalL1CacheSupported << std::endl;
std::cout << "Local L1 cache supported: " << device_prop.localL1CacheSupported << std::endl;
std::cout << "Shared memory per multiprocessor: " << device_prop.sharedMemPerMultiprocessor << std::endl;
std::cout << "Registers per multiprocessor: " << device_prop.regsPerMultiprocessor << std::endl;
std::cout << "Managed memory: " << device_prop.managedMemory << std::endl;
std::cout << "Is multi-GPU board: " << device_prop.isMultiGpuBoard << std::endl;
std::cout << "Multi-GPU board group ID: " << device_prop.multiGpuBoardGroupID << std::endl;
std::cout << "Host native atomic supported: " << device_prop.hostNativeAtomicSupported << std::endl;
std::cout << "Single to double precision perf ratio: " << device_prop.singleToDoublePrecisionPerfRatio << std::endl;
std::cout << "Pageable memory access: " << device_prop.pageableMemoryAccess << std::endl;
std::cout << "Concurrent managed access: " << device_prop.concurrentManagedAccess << std::endl;
std::cout << "Compute preemption supported: " << device_prop.computePreemptionSupported << std::endl;
std::cout << "Can use host pointer for registered memory: " << device_prop.canUseHostPointerForRegisteredMem << std::endl;
std::cout << "Cooperative launch: " << device_prop.cooperativeLaunch << std::endl;
std::cout << "Cooperative multi-device launch: " << device_prop.cooperativeMultiDeviceLaunch << std::endl;
std::cout << "Pageable memory access uses host page tables: " << device_prop.pageableMemoryAccessUsesHostPageTables << std::endl;
std::cout << "Direct managed memory access from host: " << device_prop.directManagedMemAccessFromHost << std::endl;
std::cout << "Max blocks per multiprocessor: " << device_prop.maxBlocksPerMultiProcessor << std::endl;
}
int device_id;
HANDLE_ERROR(cudaGetDevice(&device_id));
std::cout << "Current device id: " << device_id << std::endl;
cudaDeviceProp ideal_device_prop{};
memset(&ideal_device_prop, 0, sizeof(cudaDeviceProp));
ideal_device_prop.major = 1;
ideal_device_prop.minor = 3;
HANDLE_ERROR(cudaChooseDevice(&device_id, &ideal_device_prop));
std::cout << "Ideal device id: " << device_id << std::endl;
HANDLE_ERROR(cudaSetDevice(device_id));
std::cout << "Setting current device id to " << device_id << std::endl;
int* result = (int*)malloc(sizeof(int));
int* dev_result;
// HANDLE_ERROR: simple implementation of error catch
// cudaMalloc: Just like malloc, but the parameter is pointer of pointer
HANDLE_ERROR(cudaMalloc((void**)&dev_result, sizeof(int)));
add_integer<<<1, 1>>>(3, 4, dev_result);
// the following code is not allowed, because modifying cuda memory out of add_integer function
// will result in memory error
// *dev_result = 5;
// copy the cuda memory from device gpu to host system dram
HANDLE_ERROR(
cudaMemcpy(result, dev_result, sizeof(int), cudaMemcpyDeviceToHost)
);
std::cout << "Hello, World! result=" << *result << std::endl;
cudaFree(dev_result);
free(result);
return 0;
}
void framebuffer_size_callback(GLFWwindow* window, int width, int height) {
need_redraw = true;
glViewport(0, 0, width, height);
}
void process_input(GLFWwindow* window) {
// glfwGetKey: get the state of the key
if (glfwGetKey(window, GLFW_KEY_ESCAPE) == GLFW_PRESS) {
glfwSetWindowShouldClose(window, true);
}
// wasd
float pixel_per_input = 30 * scale;
if (glfwGetKey(window, GLFW_KEY_W) == GLFW_PRESS) {
shift_y -= pixel_per_input;
}
if (glfwGetKey(window, GLFW_KEY_S) == GLFW_PRESS) {
shift_y += pixel_per_input;
}
if (glfwGetKey(window, GLFW_KEY_A) == GLFW_PRESS) {
shift_x += pixel_per_input;
}
if (glfwGetKey(window, GLFW_KEY_D) == GLFW_PRESS) {
shift_x -= pixel_per_input;
}
// q, e
if (glfwGetKey(window, GLFW_KEY_Q) == GLFW_PRESS) {
scale *= 1.0f + 0.7f * delta_time;
}
if (glfwGetKey(window, GLFW_KEY_E) == GLFW_PRESS) {
scale /= 1.0f + 0.7f * delta_time;
}
// z to increase real part of C, x to decrease real part of C
if (glfwGetKey(window, GLFW_KEY_Z) == GLFW_PRESS) {
C.r += 0.0001f;
}
if (glfwGetKey(window, GLFW_KEY_X) == GLFW_PRESS) {
C.r -= 0.0001f;
}
// c to increase imaginary part of C, v to decrease imaginary part of C
if (glfwGetKey(window, GLFW_KEY_C) == GLFW_PRESS) {
C.i += 0.0001f;
}
if (glfwGetKey(window, GLFW_KEY_V) == GLFW_PRESS) {
C.i -= 0.0001f;
}
}
void scroll_callback(GLFWwindow* window, double unknown_offset, double scroll_offset) {
need_redraw = true;
const float coefficient = 0.1f * scale;
scale -= coefficient * (float)scroll_offset;
if (scale <= 0.0f) {
scale = 0.00001f;
}
// std::cout << "scale: " << scale << std::endl;
}
void mouse_callback(GLFWwindow* window, double width_pos, double height_pos) {
if (first_mouse) {
last_x = width_pos;
last_y = height_pos;
first_mouse = false;
}
float x_offset = width_pos - last_x;
float y_offset = last_y - height_pos;
last_x = width_pos;
last_y = height_pos;
const float sensitivity = delta_time * 100.0f * scale;
x_offset *= sensitivity;
y_offset *= sensitivity;
shift_x -= x_offset;
shift_y -= y_offset;
// std::cout << "shift_x: " << shift_x << " shift_y: " << shift_y << std::endl;
}