-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcudalib.t
272 lines (225 loc) · 7.46 KB
/
cudalib.t
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
terralib.includepath = terralib.includepath..";/usr/local/cuda/include"
local C = terralib.includecstring [[
#include "cuda_runtime.h"
#include <stdlib.h>
#include <stdio.h>
]]
local thread_id = cudalib.nvvm_read_ptx_sreg_tid_x
local block_dim = cudalib.nvvm_read_ptx_sreg_ntid_x
local block_id = cudalib.nvvm_read_ptx_sreg_ctaid_x
local NUM_THREADS = 64
local cuda = {}
cuda.index = terra()
return thread_id() + block_id() * block_dim()
end
cuda.make_map_kernel = function(func)
local ltype = func:gettype().parameters[1].type
local kernel = terra(device_arr : <ype)
func(device_arr, cuda.index())
end
return terralib.cudacompile({kernel = kernel}).kernel
end
cuda.map = function(func)
local mapper = cuda.make_map_kernel(func)
local ltype = func:gettype().parameters[1].type
return terra(host_arr : <ype, N : int)
var cuda_arr : <ype
var params : terralib.CUDAParams
if N < NUM_THREADS then
params = terralib.CUDAParams { 1,1,1, N,1,1, 0, nil }
else
params = terralib.CUDAParams { N/NUM_THREADS,1,1, NUM_THREADS,1,1, 0, nil }
end
C.cudaMalloc([&&opaque](&cuda_arr), sizeof(ltype) * N)
C.cudaMemcpy(cuda_arr, host_arr, sizeof(ltype) * N, 1)
mapper(¶ms, cuda_arr)
C.cudaDeviceSynchronize()
C.cudaMemcpy(host_arr, cuda_arr, sizeof(ltype) * N, 2)
end
end
cuda.fixed_map = function(func, host_arr, N)
local mapper = cuda.make_map_kernel(func)
local ltype = func:gettype().parameters[1]
local init = terra()
var cuda_arr : <ype
C.cudaMalloc([&&opaque](&cuda_arr), sizeof(ltype) * N)
return cuda_arr
end
local cuda_arr = init()
return terra()
var params = terralib.CUDAParams { N/NUM_THREADS,1,1, NUM_THREADS,1,1, 0, nil }
C.cudaMemcpy(cuda_arr, host_arr, sizeof(ltype) * N, 1)
mapper(¶ms, cuda_arr)
C.cudaDeviceSynchronize()
C.cudaMemcpy(host_arr, cuda_arr, sizeof(ltype) * N, 2)
end
end
cuda.lua_map = function(func)
local map = cuda.map(func)
local ltype = func:gettype().parameters[1].type -- assume first arg is array
local new = terra(N : int) return C.malloc(sizeof(ltype) * N) end
local copy = terra(a : <ype, b : ltype, i : int) a[i] = b end
local get = terra(a : <ype, i : int) return a[i] end
local free = terra(a : <ype) C.free(a) end
return function(list)
local N = #list
local arr = new(N)
for i = 1, N do
copy(arr, list[i], i - 1)
end
map(arr, N)
for i = 1, N do
local temp = get(arr, i - 1)
if ltype:isstruct() then
for _, v in pairs(ltype.entries) do
list[i][v.field] = temp[v.field]
end
else
list[i] = temp
end
end
free(arr)
end
end
local arrays = {}
cuda.alloc = terralib.cast(int -> &int, function(N)
local init_host = terra() return [&int](C.malloc(N)) end
local init_cuda = terra()
var cuda_arr : &int
C.cudaMalloc([&&opaque](&cuda_arr), N)
return cuda_arr
end
local host_arr = init_host()
local cuda_arr = init_cuda()
table.insert(arrays, {host = host_arr, cuda = cuda_arr, size = N})
return host_arr
end)
cuda.free = function(ptr)
for _, v in pairs(arrays) do
if v.host == ptr then
local free = terra()
C.free(v.host)
C.cudaFree(v.cuda)
end
free()
end
end
end
cuda.alloc_device = terra(N : int) : &int
var cuda_arr : &int
C.cudaMalloc([&&opaque](&cuda_arr), N)
return cuda_arr
end
cuda.device_free = terra(ptr : &opaque)
C.cudaFree(ptr)
end
cuda.make_kernel = function(func)
-- assume that our function takes one argument which is a ptr to a struct
local sptr_type = func:gettype().parameters[1]
if not (sptr_type:ispointertostruct()) then
error('Type must be a struct')
end
local s_type = sptr_type.type
local params = terralib.new(s_type)
local init = terra()
var cuda_params : sptr_type
C.cudaMalloc([&&opaque](&cuda_params), sizeof(s_type))
return cuda_params
end
local cuda_params = init()
local func_wrapper = terra(A : sptr_type)
func(A, cuda.index())
end
local kernel = terralib.cudacompile({kernel = func_wrapper}).kernel
return function(p, N)
for _, entry in pairs(s_type.entries) do
if entry.type:ispointer() then
local found = false
for _, v in pairs(arrays) do
if v.host == p[entry.field] then
params[entry.field] = terralib.cast(entry.type, v.cuda)
local copy = terra() C.cudaMemcpy(v.cuda, v.host, v.size, 1) end
copy()
found = true
break
end
end
if not found then params[entry.field] = p[entry.field] end
else
params[entry.field] = p[entry.field]
end
end
local launch = terra()
var launch_params : terralib.CUDAParams
if N < NUM_THREADS then
launch_params = terralib.CUDAParams { 1,1,1, N,1,1, 0, nil }
else
launch_params = terralib.CUDAParams { (N/NUM_THREADS)+1,1,1, NUM_THREADS,1,1, 0, nil }
end
C.cudaMemcpy(cuda_params, ¶ms, sizeof(s_type), 1)
kernel(&launch_params, cuda_params)
C.cudaDeviceSynchronize()
C.cudaMemcpy(¶ms, cuda_params, sizeof(s_type), 2)
end
launch()
for _, entry in pairs(s_type.entries) do
if entry.type:ispointer() then
for _, v in pairs(arrays) do
if v.host == p[entry.field] then
local copy = terra() C.cudaMemcpy(v.host, v.cuda, v.size, 2) end
copy()
end
end
else
p[entry.field] = params[entry.field]
end
end
end
end
function make_array(typ, N)
local init = terra() return [typ](cuda.alloc(sizeof(typ.type) * N)) end
return init()
end
cuda.lua_make_kernel = function(func)
local kernel = cuda.make_kernel(func)
local stype = func:gettype().parameters[1].type -- assume first arg is struct pointer
local sval = terralib.new(stype)
return function(P, N)
for _, entry in pairs(stype.entries) do
if entry.type:ispointer() then -- assume it's an array
local arr = P[entry.field]
local tarr = make_array(entry.type, #arr)
sval[entry.field] = tarr
local set = terra(i : int, v : entry.type.type) tarr[i] = v end
for i = 1, #arr do set(i - 1, arr[i]) end
else
sval[entry.field] = P[entry.field]
end
end
kernel(sval, N)
for _, entry in pairs(stype.entries) do
if entry.type:ispointer() then -- assume it's an array
local arr = P[entry.field]
local tarr = sval[entry.field]
local get = terra(i : int) return tarr[i] end
for i = 1, #arr do arr[i] = get(i - 1) end
else
P[entry.field] = sval[entry.field]
end
end
end
end
cuda.make_struct_type = function(t)
local st = terralib.types.newstruct("custom_struct")
local type_map = {
string = &int8,
number = int, -- TODO: distinguish between floats/doubles?
boolean = bool,
table = &int
}
for k, v in pairs(t) do
table.insert(st.entries, { field = k, type = type_map[type(v)] })
end
return st
end
return cuda