-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathcudnn-tensor.rkt
222 lines (203 loc) · 7.38 KB
/
cudnn-tensor.rkt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
;; Tensors seem to be very domain specific
;; This package provides utilities for managing
;; cudnn-tensors
;;Typed tensors
#lang typed/racket
(require
"ffi-functional.rkt"
"cudnn-api.rkt"
"cuda-api.rkt"
"mem-utils.rkt"
math/base
math/array
math/matrix)
(require racket/flonum)
;; define Tensor struct
;; Provide an array, then store the FlVector of it and pointer
;; Create a gpu vector of size for the representation
;; We then memcpy between this vector and gpu vector to return
;; values.
;; desc stores the tensordescriptor
(struct cudnn-tensor ([in-array : (Array Flonum)]
[in-vect : FlVector]
[src-ptr : CPointer]
[gpu-ptr : CPointer]
[size : Exact-Nonnegative-Integer]
;; descriptive pointer
[desc : CPointer]))
;; Tensor
;; [ [ [1 2 3] [3 4 5] [3 4 5] [3 4 5] ] [ [5 6 7] [5 6 7] [5 6 7] [7 8 9]]]
;; batch size is 2, sequence length is 4
;; input size is 3
;; For input layers
(: make-cudnn-input-tensor (-> (Array Flonum) cudnn-tensor))
(define (make-cudnn-input-tensor arr)
(match-let* ([ptr (get-pointer)]
[(vector batch-size input-size sequence-size) (array-shape arr)]
[size (* batch-size input-size sequence-size DOUBLE-SIZE)]
;;cudaMalloc expects the address of the pointer
[resAlloc (cudaMalloc ptr size)]
)
(display (format "Result of gpu allocation request is ~a" resAlloc))
(let ([tens
(cudnn-tensor arr
(flarray-data (array->flarray arr))
(array->cptr arr)
(dref-ptr ptr)
size
;; this should be an array of the descriptors
(init-input-tensor-descriptor batch-size input-size sequence-size))])
(print "Initialized a tensor")
tens)))
;; For simple tensors
(: make-cudnn-simple-tensor (-> (Array Flonum) cudnn-tensor))
(define (make-cudnn-simple-tensor arr)
(match-let* ([ptr (get-pointer)]
[(vector batch-size input-size sequence-size) (array-shape arr)]
[size (* batch-size input-size sequence-size DOUBLE-SIZE)]
;;cudaMalloc expects the address of the pointer
[resAlloc (cudaMalloc ptr size)]
)
(display (format "Result of gpu allocation request is ~a" resAlloc))
(let ([tens
(cudnn-tensor arr
(flarray-data (array->flarray arr))
(array->cptr arr)
(dref-ptr ptr)
size
(init-simple-tensor-descriptor batch-size input-size sequence-size))])
(print "Initialized a tensor")
tens)))
;; Creates the tensor descriptor
(: init-input-tensor-descriptor (-> Nonnegative-Integer
Nonnegative-Integer
Nonnegative-Integer
CPointer))
(define (init-input-tensor-descriptor batch-size input-size sequence-size)
(let
(
;; allocate a block of tensor descriptors
[tensor-descriptor-array (cuda-create-tensor-descriptr-ptr sequence-size)]
[dimA (get-int-array-block (list 3))]
[strideA (get-int-array-block (list 3))]
)
(copy-int-to-block dimA (list batch-size input-size 1))
(copy-int-to-block strideA (list input-size 1 1))
(for ([desc-idx : Nonnegative-Integer (in-range sequence-size)])
(let
([desc-ptr (get-tensor-desc-ptr tensor-descriptor-array desc-idx)])
(cudnnCreateTensorDescriptor desc-ptr) ;;address of the tensor descriptor
(cudnnSetTensorNdDescriptor
(dref-tensor-desc-ptr desc-ptr) ;; the tensor descriptor
0
3
dimA
strideA)))
tensor-descriptor-array))
;; Allocate a tensor descriptor
(: init-simple-tensor-descriptor (-> Nonnegative-Integer
Nonnegative-Integer
Nonnegative-Integer CPointer))
(define (init-simple-tensor-descriptor layers batch-size hidden-size)
(let
(
[tensor-descriptor-array (cuda-create-tensor-descriptr-ptr 1)]
[dimA (get-int-array-block (list 3))]
[strideA (get-int-array-block (list 3))]
)
(print "Attempting copy")
(copy-int-to-block dimA (list layers batch-size hidden-size))
(copy-int-to-block strideA (list (* batch-size hidden-size) hidden-size 1))
(print "Finished copy, attempting descriptor creation")
(cudnnCreateTensorDescriptor tensor-descriptor-array)
(print "Setting the descriptor")
(let ([tensor-desc (dref-tensor-desc-ptr tensor-descriptor-array)])
(cudnnSetTensorNdDescriptor tensor-desc 0 3
dimA strideA)
(print "Completed setting the descriptor")
tensor-desc)))
(: make-cudnn-layer-tensor (-> Index Index
Index cudnn-tensor))
(define (make-cudnn-layer-tensor num-layers hidden-size mini-batch)
(make-cudnn-simple-tensor
(make-array
(ann ((inst vector Index) num-layers hidden-size mini-batch) In-Indexes)
1.0)))
;; Allocate an array of tensor descriptors for each training iteration
(: init-layer-tensor-descriptor (-> Nonnegative-Integer
Nonnegative-Integer
Nonnegative-Integer CPointer))
(define (init-layer-tensor-descriptor layers batch-size hidden-size)
(let
(
[tensor-descriptor-array (cuda-create-tensor-descriptr-ptr 1)]
[dimA (get-int-array-block (list 3))]
[strideA (get-int-array-block (list 3))]
)
(copy-int-to-block dimA (list layers batch-size hidden-size))
(copy-int-to-block strideA (list (* batch-size hidden-size) hidden-size 1))
(cudnnCreateTensorDescriptor tensor-descriptor-array)
(cudnnSetTensorNdDescriptor (dref-tensor-desc-ptr tensor-descriptor-array) 0 3
dimA strideA)
(dref-tensor-desc-ptr tensor-descriptor-array)))
;; allocate the gpu memory
(: tensor-allocate-gpu (cudnn-tensor -> cudnn-tensor))
(define (tensor-allocate-gpu tensor)
tensor)
;; copy cpu memory to gpu
(: tensor-cpu->gpu (cudnn-tensor -> Symbol))
(define (tensor-cpu->gpu tensor)
(display (format "Starting copy to GPU with:\n" ))
(print-double-block (cudnn-tensor-src-ptr tensor) (flvector-length (cudnn-tensor-in-vect tensor)))
(cudaDeviceSynchronize)
(cuda-host-to-device-copy (cudnn-tensor-gpu-ptr tensor) (cudnn-tensor-src-ptr tensor)
(cudnn-tensor-size tensor))
;;(cudnn-tensor-size tensor)
)
;; copy gpu memory to cpu
(: tensor-gpu->cpu (cudnn-tensor -> Symbol))
(define (tensor-gpu->cpu tensor)
(display (format "Starting copy from GPU\n"))
(cuda-device-to-host-copy
(cudnn-tensor-src-ptr tensor)
(cudnn-tensor-gpu-ptr tensor)
(cudnn-tensor-size tensor)))
;; run the cudnnAddTensor function
(: tensor-add-tensors (CPointer cudnn-tensor cudnn-tensor -> Symbol))
(define (tensor-add-tensors handle tensor-a tensor-b)
(let*
([alpha (get-double-array-block (list 1))]
[beta (get-double-array-block (list 1))])
(copy-double-to-block alpha (list 1.0))
(display (format "Alpha is ~a\n" (print-double-block alpha 1)))
(copy-double-to-block beta (list 1.0))
(display (format "Beta is ~a\n" (print-double-block beta 1)))
(cudnnAddTensor handle
alpha
(cudnn-tensor-desc tensor-a)
(cudnn-tensor-gpu-ptr tensor-a)
beta
(cudnn-tensor-desc tensor-b)
(cudnn-tensor-gpu-ptr tensor-b))))
;; print the values in a tensor
(define (tensor-print-values [tensor : cudnn-tensor])
(print-double-block (cudnn-tensor-src-ptr tensor) (flvector-length (cudnn-tensor-in-vect tensor)))
tensor)
(provide
tensor-print-values
tensor-add-tensors
tensor-gpu->cpu
tensor-cpu->gpu
tensor-allocate-gpu
init-layer-tensor-descriptor
init-input-tensor-descriptor
cudnn-tensor
make-cudnn-input-tensor
init-simple-tensor-descriptor
cudnn-tensor-src-ptr
cudnn-tensor-size
cudnn-tensor-desc
cudnn-tensor?
make-cudnn-layer-tensor
cudnn-tensor-gpu-ptr
)