THCTensorSort.cuh

#ifndef THC_TENSORSORT_CUH
#define THC_TENSORSORT_CUH
#include <THC/THCTensorMath.h>
#include <THC/THCGeneral.h>
#include <THC/THCReduceApplyUtils.cuh>
#include <THC/THCSortUtils.cuh>
#include <THC/THCTensorCopy.h>
#include <THC/THCTensorTypeUtils.cuh>
#include <THC/THCThrustAllocator.cuh>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#if CUDA_VERSION >= 7000 || defined(__HIP_PLATFORM_HCC__)
#include <thrust/system/cuda/execution_policy.h>
#endif

template <typename T, bool handleNaN = false>
struct ThrustGTOp {
  __device__ bool operator()(const T& lhs, const T& rhs) const {
    return (handleNaN && THCNumerics<T>::isnan(lhs) && !THCNumerics<T>::isnan(rhs)) ||
        THCNumerics<T>::gt(lhs, rhs);
  }
};

template <typename T, bool handleNaN = false>
struct ThrustLTOp {
  __device__ bool operator()(const T& lhs, const T& rhs) const {
    return (handleNaN && THCNumerics<T>::isnan(rhs) && !THCNumerics<T>::isnan(lhs)) ||
        THCNumerics<T>::lt(lhs, rhs);
  }
};
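
// Example usage (a sketch, not part of the original header; `d_keys` and
// `n` are assumed to be a device pointer and an element count): with
// handleNaN = true, an ascending sort moves NaNs to the end of the range,
// since every non-NaN value compares less than NaN under this comparator:
//
//   thrust::sort(thrust::device, d_keys, d_keys + n,
//                ThrustLTOp<float, true>());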

// `base` is the base address of a tensor.
// For each slice (defined as a linear point of `out`, from 0 ->
// (sliceSize - 1) * sliceStride), we fill that slice from `0` to
// `sliceSize - 1`.
template <typename IndexType, int Dim>
__global__ void
fillSliceWithIndex(TensorInfo<int64_t, IndexType> out,
                   IndexType totalSlices,
                   IndexType sliceSize,
                   IndexType sliceStride) {
  IndexType slice = getLinearBlockId<IndexType>();

  if (slice >= totalSlices) {
    return;
  }

  const uint64_t offset =
    IndexToOffset<int64_t, IndexType, Dim>::get(slice, out);
  int64_t* base = &out.data[offset];

  for (int64_t i = threadIdx.x; i < sliceSize; i += blockDim.x) {
    // Torch indices are 1-based (hence the +1)
    base[i * sliceStride] = i + TH_INDEX_BASE;
  }
}
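
// For example (illustrative, not from the original header): with a
// contiguous 2 x 3 layout (totalSlices = 2, sliceSize = 3,
// sliceStride = 1), each slice is filled with 0, 1, 2 (shifted by
// TH_INDEX_BASE), so the output reads 0 1 2 0 1 2 overall.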

// For slice sorting in Thrust; extracts a slice index from a linear
// index and uses that for comparison
struct SliceComp {
  SliceComp(int64_t size) : sliceSize(size) {}

  __device__ bool operator()(const int64_t& a, const int64_t& b) const {
    // Since the slices are guaranteed to be innermost,
    // the segment is just via int64_t division
    int64_t segA = a / sliceSize;
    int64_t segB = b / sliceSize;
    return segA < segB;
  }

  const int64_t sliceSize;
};
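
// For example, with sliceSize = 4, linear indices 0..3 all compare as
// segment 0 and 4..7 as segment 1, so a *stable* sort keyed on SliceComp
// regroups elements by slice while preserving the already-sorted order
// within each slice.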

// For sorting in Thrust; extracts a within-slice index from a linear index
struct GlobalIndexToPerSliceIndex {
  GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {}

  __device__ inline void operator()(int64_t& v) const {
    v = v % sliceSize + TH_INDEX_BASE;
  }

  const int64_t sliceSize;
};
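
// How these pieces combine in a Thrust-based slice sort (a sketch under
// assumed names; `keys`, `indices`, `n`, and `sliceSize` are not part of
// this header):
//
//   // tag each element with its global linear index beforehand, then
//   // sort (key, index) pairs with the NaN-aware comparator
//   thrust::sort_by_key(thrust::device, keys, keys + n, indices,
//                       ThrustLTOp<float, true>());
//   // stable-sort on the slice segment to regroup each slice,
//   // preserving the sorted order within it
//   thrust::stable_sort_by_key(thrust::device, indices, indices + n, keys,
//                              SliceComp(sliceSize));
//   // convert each global linear index to its within-slice index
//   thrust::for_each(thrust::device, indices, indices + n,
//                    GlobalIndexToPerSliceIndex(sliceSize));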

void THCudaLongTensor_fillSliceWithIndex(THCState* state,
                                         THCudaLongTensor* t,
                                         int dim);

#endif // THC_TENSORSORT_CUH