forked from adrian-bl/libfairydust
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfairydust.c
624 lines (461 loc) · 15.4 KB
/
fairydust.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
/*
*
* Hijack some Cuda and OpenCL API-Calls
*
* (C) 2010-2011 Adrian Ulrich / ETHZ
*
* Licensed under the terms of `The Artistic License 2.0'
* See: ../artistic-2_0.txt or http://www.perlfoundation.org/attachment/legal/artistic-2_0.txt
*
*/
#include "fairydust.h"
/* java doesn't play well with RTLD_NEXT -> needs dlopen() */
static void *cuda_lib = NULL;
int cuda_set_device = -1;
/**************************************************************************************
* OpenCL part
***************************************************************************************/
/*
* Emulate clGetDeviceIDs
*/
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDs(cl_platform_id platform , cl_device_type device_type,
cl_uint num_entries , cl_device_id *devices,
cl_uint *num_devices) {
static void * (*nv_gdi) (); /* libOpenCL's version of clGetDeviceIDs */
cl_uint internal_num_devices; /* number of (all) hw-devices */
cl_device_id *internal_devices; /* will hold the IDs of ALL devices */
cl_int nv_return, lock_cnt, i, foo;
if(platform == NULL)
RMSG(HINT_OCL_PLATFORM_NULL);
/* init call to libOpenCL */
if(!nv_gdi)
nv_gdi = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
/* init libfairydust and get number of useable devices */
__fdust_init();
lock_cnt = _xxGetNumberOfLockedDevices();
/* Get the number of physical devices */
nv_return = nv_gdi(platform, device_type, NULL, NULL, &internal_num_devices);
assert( nv_return == CL_SUCCESS ); /* this shouldn't fail in any case */
assert( lock_cnt > 0 ); /* could we lock anything ? */
assert( internal_num_devices >= lock_cnt ); /* Did we lock too many devices (??!) */
/* caller wants to know how many devices matched -> fixup the result */
if(num_devices != NULL)
*num_devices = lock_cnt;
DPRINT("Hardware has %d physical devices, returning %d (num_devices=%p)\n", internal_num_devices, lock_cnt, num_devices);
/* caller (also) want's to actually get a list of devices */
if(num_entries > 0 && devices != NULL) {
if( num_entries > lock_cnt) { /* Caller requested more devices than available, well: we don't care */
num_entries = lock_cnt;
}
internal_devices = (cl_device_id *)malloc(internal_num_devices*sizeof(cl_device_id));
if(!internal_devices)
return CL_OUT_OF_HOST_MEMORY;
nv_return = nv_gdi(platform, device_type, internal_num_devices, internal_devices, NULL);
if(nv_return != CL_SUCCESS)
return nv_return;
DPRINT("We got %d devices, caller requested %d of them\n", internal_num_devices, num_entries);
for(i=0;i<num_entries;i++) {
foo = _xxGetPhysicalDevice(i);
assert( foo >= 0 );
ocl_ptrcache[i] = devices[i] = internal_devices[foo]; // fake info for caller
DPRINT("physical_device=%d, fake_device=%d, pointer=%p, want=%d\n", foo, i, internal_devices[foo], num_entries);
}
}
return nv_return;
}
/*
* Emulate clGetDeviceInfo -> just add the fdust: tag to the name
*/
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(cl_device_id device_id , cl_device_info param_name,
size_t param_value_size , void *param_value,
size_t *param_value_size_ret) {
static void * (*nv_gdx) (); /* libOpenCL's version of clGetDeviceInfo */
cl_int gdx_return;
if(!nv_gdx)
nv_gdx = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
__fdust_init();
gdx_return = nv_gdx(device_id, param_name, param_value_size, param_value, param_value_size_ret);
if(gdx_return == CL_SUCCESS && param_name == CL_DEVICE_NAME) {
param_value_size_ret = NULL; // requests caller to ignore and was set to NULL in Cuda 3.2RC anyway...
_xxAddDeviceMapping(param_value, param_value_size, _xxGetPhysicalDevice(_xxGetFakedevFromClPtr(device_id)), _xxGetFakedevFromClPtr(device_id));
}
return gdx_return;
}
/**************************************************************************************
* CUDA part
***************************************************************************************/
/*
* Cuda provides a Runtime-API and a 'driver' API with almost the same syntax.
* ..but for some reason, we cannot use the RT-Api in while executing a Driver
* function (and vice versa). This is the reason why we have the __cuda_cruft*
* functions.
*
*/
/************************************ CUDA RUNTIME API ************************************/
/*
* Implements GetDeviceCount stuff:
*/
CUresult __cuda_cruft_GetDeviceCount(CUresult errcode, int *count) {
int lock_cnt = -1;
__fdust_init();
if(errcode == CUDA_SUCCESS) {
lock_cnt = _xxGetNumberOfLockedDevices();
assert( lock_cnt > 0 );
assert( *count >= lock_cnt );
*count = lock_cnt;
}
return errcode;
}
/*
* CURT version of cuGetDeviceCount
*/
extern __host__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count) {
void * (*CURT_cgdc) ();
__fdust_init();
CURT_cgdc = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
return __cuda_cruft_GetDeviceCount( (CUresult)CURT_cgdc(count), count );
}
/*
* Implements GetDeviceProperties
*/
extern __host__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
void * (*CURT_cgdp) ();
cudaError_t rval;
__fdust_init();
CURT_cgdp = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
rval = (cudaError_t)CURT_cgdp(prop, _xxGetPhysicalDevice(device));
/* add fairydust description to devname if everything worked out */
if(rval == cudaSuccess) {
_xxAddDeviceMapping(&prop->name, CUDA_PROP_NAME_LEN, _xxGetPhysicalDevice(device), device);
}
return rval;
}
/*
* Sets context to a specific device
*/
extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) {
void * (*CURT_csd) ();
cudaError_t nvreturn;
__fdust_init();
CURT_csd = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
nvreturn = (cudaError_t)CURT_csd(_xxGetPhysicalDevice(device));
if(nvreturn == CUDA_SUCCESS)
cuda_set_device = device;
return nvreturn;
}
/*
* ??
*/
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device) {
void * (*CURT_cglsgld) ();
cudaError_t nvreturn;
__fdust_init();
CURT_cglsgld = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
nvreturn = (cudaError_t)CURT_cglsgld(_xxGetPhysicalDevice(device));
if(nvreturn == CUDA_SUCCESS)
cuda_set_device = device;
return nvreturn;
}
/*
* Chooses a device based on properties -> does nothing in libfairydust (always sets device to fakedev#0)
*/
extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) {
RMSG("oh noes! call to cudaChooseDevice ignored, retuning first device");
*device = 0;
return CUDA_SUCCESS;
}
/*
* noop - should be implemented some day
*/
extern __host__ cudaError_t CUDARTAPI cudaGetDevice(int *device) {
if(cuda_set_device >= 0) {
*device = cuda_set_device;
}
else {
*device = 0;
DPRINT("*WARNING* cudaGetDevice() called -> device was never initialized, returning %d",*device);
}
return CL_SUCCESS;
}
/*
* imlements cudaSetValidDevices
*/
extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len) {
void * (*CURT_csvd) ();
int i;
cudaError_t nvreturn;
__fdust_init();
CURT_csvd = (void *(*) ()) dlsym(RTLD_NEXT, __func__);
/* convert the virtual device id's into physical ids */
for(i=0;i<len;i++) {
device_arr[i] = _xxGetPhysicalDevice(device_arr[i]);
}
nvreturn = (cudaError_t)CURT_csvd(device_arr,len);
/* ..and back (we do not care about the return code */
for(i=0;i<len;i++) {
device_arr[i] = _xxGetVirtualDevice(device_arr[i]);
}
return nvreturn;
}
/************************************ CUDA DRIVER API ************************************/
/*
* Creates a new Ctx
*/
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev ) {
void * (*cu_ctxcreate) ();
CUdevice hwdev;
__fdust_init();
cu_ctxcreate = __cu_dlsym(__func__);
hwdev = _xxGetPhysicalDevice(dev);
DPRINT(" ***** routing virtual device %d to hardware device %d\n", dev, hwdev);
return (CUresult)cu_ctxcreate(pctx, flags, hwdev);
}
/*
* The obsoleted cuCtxCreate call. I have NO idea why nvidia renamed this to _v2 ...
*/
#ifndef cuCtxCreate
#error "cuCtxCreate is not re-defined, but it should be on cuda >= 3.2"
#else
#undef cuCtxCreate
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev ) {
void * (*cu_ctxcreate) ();
CUdevice hwdev;
__fdust_init();
cu_ctxcreate = __cu_dlsym("cuCtxCreate_v2");
hwdev = _xxGetPhysicalDevice(dev);
RMSG("OBSOLETED API call!");
return (CUresult)cu_ctxcreate(pctx, flags, hwdev);
}
// fixme: should check if this was true in the first place:
#define cuCtxCreate cuCtxCreate_v2
#endif
/*
* Returns the current device in Ctx
*/
CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) {
void * (*cu_ctxgetdev) ();
CUresult nvreturn;
CUdevice ctxdev;
__fdust_init();
cu_ctxgetdev = __cu_dlsym(__func__);
nvreturn = (CUresult)cu_ctxgetdev(&ctxdev);
if(nvreturn == CUDA_SUCCESS) {
*device = _xxGetVirtualDevice(ctxdev);
}
return nvreturn;
}
/*
* Returns the number of available devices
*/
CUresult CUDAAPI cuDeviceGetCount(int *count) {
void * (*cu_px) ();
__fdust_init();
cu_px = __cu_dlsym(__func__);
return __cuda_cruft_GetDeviceCount( (CUresult)cu_px(count), count );
}
/*
* Returns a device handle for fakedev specified in 'ordinal'
*/
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
void * (*cu_px) ();
CUresult nvreturn;
__fdust_init();
cu_px = __cu_dlsym(__func__);
nvreturn = (CUresult)cu_px(device, _xxGetPhysicalDevice(ordinal));
if(nvreturn == CL_SUCCESS)
*device = _xxGetVirtualDevice(*device);
return nvreturn;
}
/*
* Return compute capability
*/
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev) {
void * (*cu_px) ();
__fdust_init();
cu_px = __cu_dlsym(__func__);
return (CUresult)cu_px(major, minor, _xxGetPhysicalDevice(dev));
}
/*
* Return total memory for given device
*/
#if CUDA_VERSION >= 3020
CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) {
#else
CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev) {
#endif
void * (*cu_px) ();
__fdust_init();
cu_px = __cu_dlsym(__func__);
return (CUresult)cu_px(bytes, _xxGetPhysicalDevice(dev));
}
/*
* Fixes up the devname to include fdust stuff :)
*/
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) {
void * (*cu_px) ();
CUresult nvreturn;
__fdust_init();
cu_px = __cu_dlsym(__func__);
nvreturn = (CUresult)cu_px(name, len, _xxGetPhysicalDevice(dev));
if(nvreturn == CUDA_SUCCESS) {
_xxAddDeviceMapping(name, len, _xxGetPhysicalDevice(dev), dev);
}
return nvreturn;
}
/*
* Returns device properties of given device
*/
CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev) {
void * (*cu_px) ();
__fdust_init();
cu_px = __cu_dlsym(__func__);
return (CUresult)cu_px(prop, _xxGetPhysicalDevice(dev));
}
/*
* Returns attributes for given device
*/
CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) {
void * (*cu_px) ();
__fdust_init();
cu_px = __cu_dlsym(__func__);
return (CUresult)cu_px(pi, attrib, _xxGetPhysicalDevice(dev));
}
/*
* This MIGHT be a performance hog: __fdust_init could actually patch-this-out if it gets called
* but i didn't figure out how to do this (yet)
*/
extern __host__ cudaError_t cudaMalloc (void ** devptr, size_t size) {
void * (*cu_malloc) ();
if(cuda_set_device < 0) {
RMSG("OUCH! Your application just called cudaMalloc(), but no device was set!");
RMSG("Simulating driver bug: setting first device as in-use");
cudaSetDevice(0);
}
cu_malloc = dlsym(RTLD_NEXT,__func__);
return (cudaError_t)cu_malloc(devptr, size);
}
#ifdef _FDUST_PROFILE
/* Disabled per default: This can have a negative impact on performance */
extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) {
void * (*cu_perf_thread_sync)();
__fdust_init();
cu_perf_thread_sync = dlsym(RTLD_NEXT,__func__);
return (cudaError_t)cu_perf_thread_sync();
}
extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void) {
void * (*cu_perf_thread_exit)();
__fdust_init();
cu_perf_thread_exit = dlsym(RTLD_NEXT,__func__);
return (cudaError_t)cu_perf_thread_exit();
}
cudaError_t cudaLaunch (const char *entry) {
void * (*cu_perf_launch)();
__fdust_init();
cu_perf_launch = dlsym(RTLD_NEXT,__func__);
fdust_perfc.cu_kernel_launch++;
return (cudaError_t)cu_perf_launch(entry);
}
#endif
void cublasInit() {
void * (*cublas)();
if(cuda_set_device < 0) {
RMSG("OUCH! Called cublasInit() with no active device! calling cudaSetDevice(0) right now\n");
cudaSetDevice(0);
}
cublas = dlsym(RTLD_NEXT,__func__);
cublas();
}
/**************************************************************************************
* Misc stuff
***************************************************************************************/
/*
* Initializes the library. This should get called by ALL functions
* __fdust_init does..
* -> set the debug level
* -> contact fairyd / lock devices
*/
void __fdust_init() {
if(reserved_devices[0] == FDUST_RSV_NINIT) {
__fdust_spam();
lock_fdust_devices(reserved_devices, FDUST_MODE_NVIDIA);
printf("%s allocated gpu-count: %d device(s)\n", __FILE__, _xxGetNumberOfLockedDevices());
}
}
/*
* Return 'modeid'
*/
const char *__fdust_mode() {
return "cuda.so";
}
/*
* dlsym wrapper
*/
void *__cu_dlsym(const char *func) {
/* We do not know when (and how) we get called for the first time, so we cannot
* 'pre-init' cuda */
if(cuda_lib == NULL)
cuda_lib = dlopen(LIBCUDA_SONAME, RTLD_NOW);
assert(cuda_lib != NULL);
return dlsym(cuda_lib, func);
}
/*
* Returns the PHYSICAL device id of given fakedev
*/
cl_uint _xxGetPhysicalDevice(cl_uint virtual_i) {
cl_int result;
assert(virtual_i < MAX_GPUCOUNT);
result = reserved_devices[virtual_i];
DPRINT("virtual device=%d, physical device=%d\n", virtual_i, result);
assert( result >= 0 );
return (cl_uint)result;
}
/*
* Returns the FAKEID for given physical device
*/
cl_uint _xxGetVirtualDevice(cl_uint physical_i) {
cl_uint i;
DPRINT("physical_i=%d\n",physical_i);
assert(physical_i < MAX_GPUCOUNT);
for(i=0;i<MAX_GPUCOUNT;i++) {
if(reserved_devices[i] == physical_i)
return i;
if(reserved_devices[i] < 0)
break;
}
/* reached on error */
assert(0);
}
/*
* Returns the number of locked devices
*/
cl_uint _xxGetNumberOfLockedDevices() {
cl_uint i;
for(i=0;i<MAX_GPUCOUNT;i++) {
if(reserved_devices[i] == FDUST_RSV_END)
return i;
}
return -1;
}
/*
* Takes an OpenCL device pointer and returns the FAKEDEV for it
*/
cl_uint _xxGetFakedevFromClPtr(cl_device_id clptr) {
cl_int i=0;
for(i=0;i<MAX_GPUCOUNT;i++) {
if(ocl_ptrcache[i] == clptr)
return i;
}
RMSG("Ooops! shouldn't be here -> somehow ocl_ptrcache got messed up?!");
abort();
return 0; // make gcc happy!
}
/*
* Includes some debug infos in the device description
*/
void _xxAddDeviceMapping(void *param_value, size_t malloced_bytes, cl_int real_dev, cl_int fake_dev) {
char fdust_devname[CUDA_PROP_NAME_LEN] = {0};
sprintf(fdust_devname, " - fdust{v:h}={%u:%u}", fake_dev, real_dev);
if( (strlen(param_value)+strlen(fdust_devname)) < malloced_bytes ) {
memcpy(param_value+strlen(param_value), fdust_devname, strlen(fdust_devname)+1); // +1 = include \0 of sprintf()
}
}