-
Notifications
You must be signed in to change notification settings - Fork 75
/
Copy pathTraits.hpp
314 lines (292 loc) · 13.7 KB
/
Traits.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
* Andrea Bocci, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/
#pragma once
#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/Common.hpp"
#include "alpaka/core/Debug.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/core/OmpSchedule.hpp"
#include "alpaka/dim/Traits.hpp"
#include "alpaka/idx/Traits.hpp"
#include "alpaka/queue/Traits.hpp"
#include "alpaka/vec/Vec.hpp"
#include "alpaka/workdiv/Traits.hpp"
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <utility>
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
#    include <iostream>
#endif
//! The alpaka accelerator library.
namespace alpaka
{
//! The kernel traits.
namespace trait
{
//! The kernel execution task creation trait.
//!
//! Only declared here; this header provides no primary definition. alpaka::createTaskKernel() calls the static
//! member function createTaskKernel(workDiv, kernelFnObj, args...) of the selected specialization and returns
//! its result.
//!
//! \tparam TAcc The accelerator type.
//! \tparam TWorkDiv The work division type.
//! \tparam TKernelFnObj The kernel function object type.
//! \tparam TArgs The kernel invocation argument types.
//!
//! \note The trailing TSfinae parameter is commented out and therefore not part of the template; a parameter pack
//! has to be the last parameter of a primary class template.
template<
typename TAcc,
typename TWorkDiv,
typename TKernelFnObj,
typename... TArgs/*,
typename TSfinae = void*/>
struct CreateTaskKernel;
//! Trait reporting how many bytes of block shared dynamic memory a kernel needs.
//!
//! \tparam TKernelFnObj The kernel function object.
//! \tparam TAcc The accelerator.
//! \tparam TSfinae Extra parameter defaulted to void (named for SFINAE-based partial specializations).
//!
//! Unless specialized, the trait reports 0 bytes.
template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
struct BlockSharedMemDynSizeBytes
{
#if BOOST_COMP_CLANG
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored \
        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
#endif
    //! Default size query; every argument is ignored.
    //!
    //! \tparam TDim The dimensionality of the extent vectors.
    //! \tparam TArgs The kernel invocation argument types pack.
    //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
    //! \param blockThreadExtent The block thread extent.
    //! \param threadElemExtent The thread element extent.
    //! \param args,... The kernel invocation arguments.
    //! \return The size of the shared memory allocated for a block in bytes.
    //! This default version always returns zero.
#if BOOST_COMP_CLANG
#    pragma clang diagnostic pop
#endif
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TDim, typename... TArgs>
    ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
        [[maybe_unused]] TKernelFnObj const& kernelFnObj,
        [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
        [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& threadElemExtent,
        [[maybe_unused]] TArgs const&... args) -> std::size_t
    {
        return std::size_t{0};
    }
};
//! Trait describing the warp size a kernel requires.
//!
//! \tparam TKernelFnObj The kernel function object.
//! \tparam TAcc The accelerator.
//!
//! Unless specialized, the value is 0, which leaves the choice of warp size to the accelerator compiler and
//! runtime.
template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
struct WarpSize : std::integral_constant<std::uint32_t, 0u>
{
};

//! Variable-template shorthand for WarpSize<TKernelFnObj, TAcc>::value.
template<typename TKernelFnObj, typename TAcc>
inline constexpr std::uint32_t warpSize = WarpSize<TKernelFnObj, TAcc>::value;
//! Trait selecting the OpenMP schedule used when a kernel runs on the CpuOmp2Blocks accelerator.
//!
//! Other accelerators are not affected by it.
//!
//! There are two ways for a user to provide a schedule: specialize this trait for the kernel, or give the kernel
//! a public static member ompScheduleKind of type alpaka::omp::Schedule plus, additionally, an int member
//! ompScheduleChunkSize. With the second option alpaka never odr-uses these members.
//!
//! If schedule kind and chunk size are compile-time constants, setting them inside the kernel may benefit
//! performance.
//!
//! \tparam TKernelFnObj The kernel function object.
//! \tparam TAcc The accelerator.
//!
//! This default implementation behaves as if the trait was not specialized.
template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
struct OmpSchedule
{
private:
    //! Tag type handed back while the trait is not specialized.
    struct TraitNotSpecialized
    {
    };

public:
#if BOOST_COMP_CLANG
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored \
        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
#endif
    //! Default schedule query; every argument is ignored.
    //!
    //! \tparam TDim The dimensionality of the extent vectors.
    //! \tparam TArgs The kernel invocation argument types pack.
    //! \param kernelFnObj The kernel object for which the schedule should be returned.
    //! \param blockThreadExtent The block thread extent.
    //! \param threadElemExtent The thread element extent.
    //! \param args,... The kernel invocation arguments.
    //! \return The OpenMP schedule information as an alpaka::omp::Schedule object,
    //! returning an object of any other type is treated as if the trait is not specialized.
    //! This default version returns the private tag type.
#if BOOST_COMP_CLANG
#    pragma clang diagnostic pop
#endif
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TDim, typename... TArgs>
    ALPAKA_FN_HOST static auto getOmpSchedule(
        [[maybe_unused]] TKernelFnObj const& kernelFnObj,
        [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
        [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& threadElemExtent,
        [[maybe_unused]] TArgs const&... args) -> TraitNotSpecialized
    {
        return {};
    }
};
} // namespace trait
#if BOOST_COMP_CLANG
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored \
        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
#endif
//! Queries the block shared dynamic memory size of a kernel by forwarding all arguments to
//! trait::BlockSharedMemDynSizeBytes<TKernelFnObj, TAcc>::getBlockSharedMemDynSizeBytes.
//!
//! \tparam TAcc The accelerator type.
//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
//! \param blockThreadExtent The block thread extent.
//! \param threadElemExtent The thread element extent.
//! \param args,... The kernel invocation arguments.
//! \return The size of the shared memory allocated for a block in bytes.
//! The default trait implementation always returns zero.
#if BOOST_COMP_CLANG
#    pragma clang diagnostic pop
#endif
ALPAKA_NO_HOST_ACC_WARNING
template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(
    TKernelFnObj const& kernelFnObj,
    Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
    Vec<TDim, Idx<TAcc>> const& threadElemExtent,
    TArgs const&... args) -> std::size_t
{
    using SharedMemTrait = trait::BlockSharedMemDynSizeBytes<TKernelFnObj, TAcc>;
    return SharedMemTrait::getBlockSharedMemDynSizeBytes(kernelFnObj, blockThreadExtent, threadElemExtent, args...);
}
#if BOOST_COMP_CLANG
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored \
        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
#endif
//! Queries the OpenMP schedule of a kernel by forwarding all arguments to
//! trait::OmpSchedule<TKernelFnObj, TAcc>::getOmpSchedule.
//!
//! \tparam TAcc The accelerator type.
//! \param kernelFnObj The kernel object for which the schedule should be returned.
//! \param blockThreadExtent The block thread extent.
//! \param threadElemExtent The thread element extent.
//! \param args,... The kernel invocation arguments.
//! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
//! OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
#if BOOST_COMP_CLANG
#    pragma clang diagnostic pop
#endif
template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
ALPAKA_FN_HOST auto getOmpSchedule(
    TKernelFnObj const& kernelFnObj,
    Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
    Vec<TDim, Idx<TAcc>> const& threadElemExtent,
    TArgs const&... args)
{
    using ScheduleTrait = trait::OmpSchedule<TKernelFnObj, TAcc>;
    return ScheduleTrait::getOmpSchedule(kernelFnObj, blockThreadExtent, threadElemExtent, args...);
}
#if BOOST_COMP_CLANG
# pragma clang diagnostic push
# pragma clang diagnostic ignored \
"-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
#endif
//! Checks whether a type used as a kernel argument is trivially copyable.
//!
//! \attention When specializing this trait for a user type, make sure that calling the copy constructor gives the
//! same result as duplicating the object with memcpy. An existing destructor should be free of side effects.
//!
//! It is implementation-defined whether the closure type of a lambda is trivially copyable.
//! Therefore the default implementation is true for trivially copyable or empty (stateless) types.
//!
//! @tparam T type to check
//! @{
template<typename T, typename = void>
struct IsKernelArgumentTriviallyCopyable
    : std::integral_constant<bool, std::is_empty_v<T> || std::is_trivially_copyable_v<T>>
{
};

template<typename T>
inline constexpr bool isKernelArgumentTriviallyCopyable = IsKernelArgumentTriviallyCopyable<T>::value;
//! @}
namespace detail
{
//! Compile-time check that invoking TKernelFnObj returns void.
//!
//! \tparam TAcc The accelerator type; the kernel is checked as if invoked with a TAcc const& first argument.
template<typename TAcc, typename TSfinae = void>
struct CheckFnReturnType
{
    //! Fails a static assertion unless TKernelFnObj invoked with (TAcc const&, TArgs const&...) returns void.
    //! The function arguments are only used to deduce the types; their values are never read.
    template<typename TKernelFnObj, typename... TArgs>
    void operator()(TKernelFnObj const&, TArgs const&...)
    {
        static_assert(
            std::is_same_v<std::invoke_result_t<TKernelFnObj, TAcc const&, TArgs const&...>, void>,
            "The TKernelFnObj is required to return void!");
    }
};
//! Asserts at compile time that isKernelArgumentTriviallyCopyable<T> holds.
//!
//! This lives in its own function template so that, when it is called once per argument from a fold expression,
//! we can see which T fails the test.
//!
//! \tparam T The kernel argument type to check (createTaskKernel passes the decayed argument types).
template<typename T>
inline void assertKernelArgIsTriviallyCopyable()
{
static_assert(isKernelArgumentTriviallyCopyable<T>, "The kernel argument T must be trivially copyable!");
}
} // namespace detail
//! Creates a kernel execution task.
//!
//! Before delegating to trait::CreateTaskKernel, the following is verified at compile time:
//! - invoking the kernel with (TAcc const&, args...) returns void,
//! - the kernel function object is trivially copyable (with nvcc, extended CUDA lambdas are accepted as well),
//! - every decayed argument type satisfies isKernelArgumentTriviallyCopyable,
//! - the dimension and the index type of TWorkDiv are identical to those of TAcc.
//!
//! \tparam TAcc The accelerator type.
//! \tparam TWorkDiv The work division type.
//! \tparam TKernelFnObj The kernel function object type.
//! \tparam TArgs The kernel invocation argument types.
//! \param workDiv The index domain work division.
//! \param kernelFnObj The kernel function object which should be executed.
//! \param args,... The kernel invocation arguments.
//! \return The kernel execution task.
// This pop closes the -Wdocumentation suppression that is pushed ahead of the
// IsKernelArgumentTriviallyCopyable documentation further up in this file.
#if BOOST_COMP_CLANG
# pragma clang diagnostic pop
#endif
template<typename TAcc, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
ALPAKA_FN_HOST auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
{
// check for void return type
detail::CheckFnReturnType<TAcc>{}(kernelFnObj, args...);
// With nvcc the closure types of extended device / host-device lambdas are accepted in addition to trivially
// copyable kernels; they are detected through the nvcc type-trait builtins.
#if BOOST_COMP_NVCC
static_assert(
std::is_trivially_copyable_v<TKernelFnObj> || __nv_is_extended_device_lambda_closure_type(TKernelFnObj)
|| __nv_is_extended_host_device_lambda_closure_type(TKernelFnObj),
"Kernels must be trivially copyable or an extended CUDA lambda expression!");
#else
static_assert(std::is_trivially_copyable_v<TKernelFnObj>, "Kernels must be trivially copyable!");
#endif
// One helper instantiation per decayed argument type, so a failing static_assert identifies the offending type.
(detail::assertKernelArgIsTriviallyCopyable<std::decay_t<TArgs>>(), ...);
static_assert(
Dim<std::decay_t<TWorkDiv>>::value == Dim<TAcc>::value,
"The dimensions of TAcc and TWorkDiv have to be identical!");
static_assert(
std::is_same_v<Idx<std::decay_t<TWorkDiv>>, Idx<TAcc>>,
"The idx type of TAcc and the idx type of TWorkDiv have to be identical!");
// Full debug builds log the work division and the demangled kernel type.
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
std::cout << __func__ << " workDiv: " << workDiv << ", kernelFnObj: " << core::demangled<decltype(kernelFnObj)>
<< std::endl;
#endif
// The trait receives TArgs exactly as deduced (not decayed); the arguments are perfectly forwarded.
return trait::CreateTaskKernel<TAcc, TWorkDiv, TKernelFnObj, TArgs...>::createTaskKernel(
workDiv,
kernelFnObj,
std::forward<TArgs>(args)...);
}
#if BOOST_COMP_CLANG
# pragma clang diagnostic push
# pragma clang diagnostic ignored \
"-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
#endif
//! Executes the given kernel in the given queue.
//!
//! Builds the kernel execution task with createTaskKernel<TAcc>() and passes it to enqueue() together with the
//! queue.
//!
//! \tparam TAcc The accelerator type.
//! \tparam TQueue The queue type.
//! \tparam TWorkDiv The work division type.
//! \tparam TKernelFnObj The kernel function object type.
//! \tparam TArgs The kernel invocation argument types.
//! \param queue The queue to enqueue the kernel execution task into.
//! \param workDiv The index domain work division.
//! \param kernelFnObj The kernel function object which should be executed.
//! \param args,... The kernel invocation arguments.
#if BOOST_COMP_CLANG
# pragma clang diagnostic pop
#endif
template<typename TAcc, typename TQueue, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
ALPAKA_FN_HOST auto exec(TQueue& queue, TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-> void
{
// The task is handed to enqueue() as a temporary and the arguments are perfectly forwarded into it.
enqueue(queue, createTaskKernel<TAcc>(workDiv, kernelFnObj, std::forward<TArgs>(args)...));
}
} // namespace alpaka