forked from wang-xinyu/tensorrtx
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMaskRcnnInference.cu
executable file
·63 lines (54 loc) · 2.34 KB
/
MaskRcnnInference.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include "MaskRcnnInferencePlugin.h"
#include "macros.h"
namespace nvinfer1 {
// Logistic sigmoid of `data`, 1 / (1 + e^(-data)), evaluated in single precision.
// Device-only helper.
__device__ float Logist(float data) {
    const float decay = expf(-data);
    const float denom = 1.0f + decay;
    return 1.0f / denom;
}
/**
 * For every detection, picks the mask plane that belongs to the detection's
 * class and writes the sigmoid of each of its values to the output.
 *
 * Work items are the elements of `masks`, addressed by a flat index that
 * decomposes as (detection, class, h, w) with w varying fastest. A work item
 * produces output only when its class equals the class stored for its
 * detection in `indices`; all other work items are no-ops.
 *
 * Launch layout: 1-D grid of 1-D blocks. The grid-stride loop makes the result
 * independent of the grid size, so any non-empty launch configuration is valid.
 * No shared memory is used.
 *
 * @param nthreads           Number of work items, i.e. the number of elements
 *                           of `masks` to visit. Values <= 0 do nothing.
 * @param detections_per_im  Not read by the kernel; kept for interface stability.
 * @param output_size        Side length of one square mask plane.
 * @param num_classes        Number of mask planes stored per detection.
 * @param indices            One class index per detection, stored as float and
 *                           truncated to int here.
 * @param masks              Mask logits, read at the flat work-item index.
 * @param out_masks          Output, addressed as (detection, h, w). Planes of
 *                           detections whose class index matches no class in
 *                           [0, num_classes) are left unwritten.
 */
__global__ void MaskRcnnInferenceKernel(
    const int nthreads,
    const int detections_per_im,
    const int output_size,
    const int num_classes,
    const float* indices,
    const float* masks,
    float* out_masks) {
    // A negative count would turn into a huge unsigned bound below.
    if (nthreads <= 0) {
        return;
    }

    const size_t total = static_cast<size_t>(nthreads);
    const size_t classes = static_cast<size_t>(num_classes);
    // Elements in one mask plane.
    const size_t plane = static_cast<size_t>(output_size) * static_cast<size_t>(output_size);
    // Elements belonging to one detection across all of its classes.
    const size_t per_detection = plane * classes;

    // Widen before multiplying so the global index cannot wrap in 32 bits.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         index < total;
         index += stride) {
        const size_t ind = index / per_detection;
        const int cur_class = static_cast<int>((index / plane) % classes);
        const int ind_class = static_cast<int>(indices[ind]);

        if (ind_class == cur_class) {
            // Position inside the plane, equal to h * output_size + w.
            const size_t pixel = index % plane;
            // The input offset (detection, class, h, w) is exactly `index`.
            out_masks[ind * plane + pixel] = Logist(masks[index]);
        }
    }
}
/**
 * Launches MaskRcnnInferenceKernel once per batch item on `stream`.
 *
 * Buffers are treated as densely packed per batch item:
 *   inputs[0]  : detections_per_im floats (class index per detection)
 *   inputs[1]  : detections_per_im * num_classes * output_size^2 floats
 *   outputs[0] : detections_per_im * output_size^2 floats
 *
 * @param batchSize          Number of batch items to process.
 * @param inputs             Device pointers; elements 0 and 1 are read as float.
 * @param outputs            Device pointers; element 0 is written as float.
 * @param detections_per_im  Detections per batch item.
 * @param output_size        Side length of one square mask.
 * @param num_classes        Mask planes per detection.
 * @param stream             Stream the kernels are queued on.
 * @return 0 on success (including an empty problem), otherwise the non-zero
 *         cudaError_t value of the failing launch or synchronization.
 */
int maskRcnnInference(int batchSize,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    int detections_per_im, int output_size, int num_classes, cudaStream_t stream) {
    // Nothing to compute; also avoids launching a grid with zero blocks,
    // which is an invalid launch configuration.
    if (batchSize <= 0 || detections_per_im <= 0 || output_size <= 0 || num_classes <= 0) {
        return 0;
    }

    // Element counts per batch item, computed in size_t so that the pointer
    // offsets below cannot overflow int for large batches.
    const size_t plane = static_cast<size_t>(output_size) * static_cast<size_t>(output_size);
    const size_t out_per_image = static_cast<size_t>(detections_per_im) * plane;
    const size_t in_per_image = out_per_image * static_cast<size_t>(num_classes);

    // The kernel receives its work-item count as an int; reject sizes that do not fit.
    if (in_per_image > static_cast<size_t>(0x7fffffff)) {
        return static_cast<int>(cudaErrorInvalidValue);
    }
    const int nthreads = static_cast<int>(in_per_image);

    const int max_threads = 1024;
    // Integer ceiling division: a float-based ceil loses precision above 2^24
    // work items and could launch too few blocks.
    const unsigned int blocksPerGrid =
        static_cast<unsigned int>((in_per_image + max_threads - 1) / max_threads);

    const float* indices_base = static_cast<const float*>(inputs[0]);
    const float* masks_base = static_cast<const float*>(inputs[1]);
    float* out_base = static_cast<float*>(outputs[0]);

    // TODO: can implement this function with thrust?
    for (int batch = 0; batch < batchSize; batch++) {
        const size_t b = static_cast<size_t>(batch);
        MaskRcnnInferenceKernel<<<blocksPerGrid, max_threads, 0, stream>>>(
            nthreads,
            detections_per_im,
            output_size,
            num_classes,
            indices_base + b * static_cast<size_t>(detections_per_im),
            masks_base + b * in_per_image,
            out_base + b * out_per_image);

        // Kernel launches report configuration errors only through the last-error state.
        const cudaError_t launch_status = cudaGetLastError();
        if (launch_status != cudaSuccess) {
            return static_cast<int>(launch_status);
        }
    }

    // Wait once for this stream only, instead of stalling the whole device
    // after every launch; results are complete when the function returns.
    // NOTE(review): if every consumer of outputs[0] is ordered on `stream`,
    // this wait is unnecessary and could be dropped - confirm against callers.
    const cudaError_t sync_status = cudaStreamSynchronize(stream);
    return sync_status == cudaSuccess ? 0 : static_cast<int>(sync_status);
}
} // namespace nvinfer1