// ReduceMinMaxKernel.cu (forked from pytorch/pytorch)

#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Reduce.cuh>
#include <ATen/native/cuda/ReduceOps.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/ReduceOps.h>
#include <ATen/native/ReduceAllOps.h>
#include <ATen/native/TensorCompare.h>
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/cuda/NumericLimits.cuh>

namespace at { namespace native {
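
// NaN-propagating max used by the value-only max reductions: if `a` is NaN it
// is returned, so a NaN anywhere in the reduced slice propagates to the result.
// The reduction identity passed below is the type's lower bound (-inf for
// floating point), which can never beat a real element.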
template <typename acc_t>
struct MaxNanFunctor {
  __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const {
    return (at::_isnan(a) || a > b) ? a : b;
  }
};

template <typename scalar_t, typename acc_t=scalar_t>
void max_values_kernel_cuda_impl(TensorIterator& iter) {
  gpu_reduce_kernel<scalar_t, scalar_t>(
      iter,
      func_wrapper<acc_t>(MaxNanFunctor<acc_t>()),
      at::numeric_limits<acc_t>::lower_bound());
}
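
// Min counterpart: NaNs again win the comparison, and the identity is the
// type's upper bound so it can never be smaller than a real element.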
template <typename acc_t>
struct MinNanFunctor {
  __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const {
    return (at::_isnan(a) || a < b) ? a : b;
  }
};

template <typename scalar_t, typename acc_t=scalar_t>
void min_values_kernel_cuda_impl(TensorIterator& iter) {
  gpu_reduce_kernel<scalar_t, scalar_t>(
      iter,
      func_wrapper<acc_t>(MinNanFunctor<acc_t>()),
      at::numeric_limits<acc_t>::upper_bound());
}
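
// Dtype-dispatched entry points for the value-only reductions, covering all
// standard dtypes plus BFloat16, Half and Bool.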
void max_values_kernel_cuda(TensorIterator& iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
    max_values_kernel_cuda_impl<scalar_t>(iter);
  });
}

void min_values_kernel_cuda(TensorIterator& iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
    min_values_kernel_cuda_impl<scalar_t>(iter);
  });
}
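
// argmax/argmin reduce to a (value, index) pair carried in a thrust::pair.
// The running value starts at the reduction identity and the running index at
// 0; only the int64_t index is written to the output.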
template <typename scalar_t, typename acc_t=scalar_t>
void argmax_kernel_cuda_impl(TensorIterator& iter) {
  gpu_reduce_kernel<scalar_t, int64_t>(
      iter,
      ArgMaxOps<acc_t>{},
      thrust::pair<acc_t, int64_t>(at::numeric_limits<acc_t>::lower_bound(), 0));
}

template <typename scalar_t, typename acc_t=scalar_t>
void argmin_kernel_cuda_impl(TensorIterator& iter) {
  gpu_reduce_kernel<scalar_t, int64_t>(
      iter,
      ArgMinOps<acc_t>{},
      thrust::pair<acc_t, int64_t>(at::numeric_limits<acc_t>::upper_bound(), 0));
}

void argmax_kernel_cuda(TensorIterator& iter) {
  // For float16 & bfloat16, instead of implementing is_nan and warp_shfl_down,
  // we can convert float16 & bfloat16 to float and do all the operations in float.
  if (iter.dtype(1) == kHalf) {
    argmax_kernel_cuda_impl<at::Half, float>(iter);
  } else if (iter.dtype(1) == kBFloat16) {
    argmax_kernel_cuda_impl<at::BFloat16, float>(iter);
  } else {
    AT_DISPATCH_ALL_TYPES(iter.dtype(1), "argmax_cuda", [&]() {
      argmax_kernel_cuda_impl<scalar_t>(iter);
    });
  }
}

void argmin_kernel_cuda(TensorIterator& iter) {
  // For float16 & bfloat16, instead of implementing is_nan and warp_shfl_down,
  // we can convert float16 & bfloat16 to float and do all the operations in float.
  if (iter.dtype(1) == kHalf) {
    argmin_kernel_cuda_impl<at::Half, float>(iter);
  } else if (iter.dtype(1) == kBFloat16) {
    argmin_kernel_cuda_impl<at::BFloat16, float>(iter);
  } else {
    AT_DISPATCH_ALL_TYPES(iter.dtype(1), "argmin_cuda", [&]() {
      argmin_kernel_cuda_impl<scalar_t>(iter);
    });
  }
}
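
// min/max with indices: MinOps/MaxOps reduce each slice to its extremal value
// together with the position where it occurs, backing the dim-reductions that
// return both values and indices.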
void min_launch_kernel(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
    gpu_reduce_kernel<scalar_t, scalar_t>(
        iter,
        MinOps<scalar_t>{},
        thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
  });
}

void max_launch_kernel(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
    gpu_reduce_kernel<scalar_t, scalar_t>(
        iter,
        MaxOps<scalar_t>{},
        thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::lower_bound(), 0));
  });
}
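
// Single-pass min+max per reduced slice: both extrema are computed in one
// sweep over the data rather than via two separate reductions.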
void aminmax_launch_kernel(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
    gpu_reduce_kernel<scalar_t, scalar_t>(
        iter,
        MinMaxOps<scalar_t, scalar_t, int32_t>{},
        thrust::pair<scalar_t, scalar_t>(
            at::numeric_limits<scalar_t>::upper_bound(),
            at::numeric_limits<scalar_t>::lower_bound()));
  });
}
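
// Full reductions over the whole tensor (no dim argument): these reuse the
// value-only implementations above.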
void min_all_launch_kernel(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
    min_values_kernel_cuda_impl<scalar_t>(iter);
  });
}

void max_all_launch_kernel(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
    max_values_kernel_cuda_impl<scalar_t>(iter);
  });
}
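
// Fused global min+max, used by the all-reduce variant of aminmax.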
template <typename scalar_t>
void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
  gpu_reduce_kernel<scalar_t, scalar_t>(
      iter,
      MinMaxOps<scalar_t, scalar_t, int32_t>{},
      thrust::pair<scalar_t, scalar_t>(
          at::numeric_limits<scalar_t>::upper_bound(),
          at::numeric_limits<scalar_t>::lower_bound()));
}

void aminmax_allreduce_launch_kernel(TensorIterator &iter) {
  AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
    _min_max_values_kernel_cuda_impl<scalar_t>(iter);
  });
}
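
// Register the CUDA kernels with the device-generic dispatch stubs declared in
// ReduceOps.h. Callers on the ATen side reach them through the stub, roughly as
// sketched below (illustrative only; the actual call sites live elsewhere in ATen):
//
//   // at::TensorIterator iter = ...;             // reduction iterator over the input
//   // max_values_stub(iter.device_type(), iter); // routes here for CUDA tensors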
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda);
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda);
REGISTER_DISPATCH(argmax_stub, &argmax_kernel_cuda);
REGISTER_DISPATCH(argmin_stub, &argmin_kernel_cuda);
}} // namespace at::native