Removing some relaxed constexpr requirements
cliffburdick committed May 31, 2024
1 parent 9d2ad17 commit 8b099d7
Showing 22 changed files with 85 additions and 89 deletions.
4 changes: 3 additions & 1 deletion include/matx/core/allocator.h
@@ -43,6 +43,8 @@

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
+#include <cuda/std/__algorithm>
+#include <cuda/std/__algorithm>

#pragma once

@@ -203,7 +205,7 @@ struct MemTracker {
[[maybe_unused]] std::unique_lock lck(memory_mtx);
matxMemoryStats.currentBytesAllocated += bytes;
matxMemoryStats.totalBytesAllocated += bytes;
-matxMemoryStats.maxBytesAllocated = std::max(
+matxMemoryStats.maxBytesAllocated = cuda::std::max(
matxMemoryStats.maxBytesAllocated, matxMemoryStats.currentBytesAllocated);
allocationMap[*ptr] = {bytes, space, stream};
}
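The allocator change above is representative of most of this commit: std::max is only usable from device-side code when nvcc is given --expt-relaxed-constexpr, whereas cuda::std::max from libcu++ is declared for both host and device. A minimal sketch of the pattern, not taken from MatX (the helper name is ours), assuming the same libcu++ header the commit includes:

```cuda
#include <cuda/std/__algorithm>  // provides cuda::std::max / cuda::std::min, as included by this commit

// Hypothetical host/device helper in the spirit of the allocator change:
// track a running peak without std::max, which would need
// --expt-relaxed-constexpr to be callable from device code.
__host__ __device__ inline size_t track_peak(size_t current, size_t peak)
{
  return cuda::std::max(peak, current);
}
```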
20 changes: 0 additions & 20 deletions include/matx/core/half_complex.h
@@ -147,26 +147,6 @@ template <typename T> struct alignas(sizeof(T) * 2) matxHalfComplex {
return {x, y};
}

-/**
-* @brief std::complex<float> cast operator
-*
-* @return std::complex<float> value
-*/
-__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ operator std::complex<float>()
-{
-return {x, y};
-}
-
-/**
-* @brief std::complex<double> cast operator
-*
-* @return std::complex<double> value
-*/
-__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ operator std::complex<double>()
-{
-return {x, y};
-}
-
/**
* @brief Copy assignment operator
*
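Dropping the implicit std::complex<float> and std::complex<double> conversions removes a host-only standard-library type from a __MATX_HOST__ __MATX_DEVICE__ interface. Host code that still needs a std::complex value can convert explicitly; the helper below is a hedged sketch (the function name is ours, and only the x/y members come from matxHalfComplex):

```cuda
#include <complex>

// Hypothetical host-side conversion: build a std::complex<float> from the
// real (x) and imaginary (y) parts of a half-precision complex value.
template <typename HalfComplex>
std::complex<float> to_std_complex(const HalfComplex &v)
{
  return { static_cast<float>(v.x), static_cast<float>(v.y) };
}
```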
16 changes: 9 additions & 7 deletions include/matx/core/pybind.h
@@ -437,12 +437,14 @@ class MatXPybind {
}

template <typename TensorType,
-typename CT = matx_convert_complex_type<typename TensorType::scalar_type>>
+typename CT = matx_convert_cuda_complex_type<typename TensorType::scalar_type>>
std::optional<TestFailResult<CT>>
CompareOutput(const TensorType &ten,
const std::string fname, double thresh, bool debug = false)
{
-using ntype = matx_convert_complex_type<typename TensorType::scalar_type>;
+using raw_type = typename TensorType::scalar_type;
+using ntype = matx_convert_complex_type<raw_type>;
+using ctype = matx_convert_cuda_complex_type<raw_type>;
auto resobj = res_dict[fname.c_str()];
auto ften = pybind11::array_t<ntype>(resobj);
constexpr int RANK = TensorType::Rank();
@@ -453,7 +455,7 @@
auto file_val = ften.at();
auto ten_val = ConvertComplex(ten());
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(0), "0", ten_val, file_val,
+return TestFailResult<ctype>{Index2Str(0), "0", ten_val, file_val,
thresh};
}
}
@@ -468,7 +470,7 @@
auto file_val = ften.at(s1, s2, s3, s4);
auto ten_val = ConvertComplex(ten(s1, s2, s3, s4));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1, s2, s3, s4),
+return TestFailResult<ctype>{Index2Str(s1, s2, s3, s4),
fname, ten_val, file_val,
thresh};
}
@@ -478,7 +480,7 @@
auto file_val = ften.at(s1, s2, s3);
auto ten_val = ConvertComplex(ten(s1, s2, s3));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1, s2, s3), fname,
+return TestFailResult<ctype>{Index2Str(s1, s2, s3), fname,
ten_val, file_val, thresh};
}
}
@@ -488,7 +490,7 @@
auto file_val = ften.at(s1, s2);
auto ten_val = ConvertComplex(ten(s1, s2));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1, s2), fname, ten_val,
+return TestFailResult<ctype>{Index2Str(s1, s2), fname, ten_val,
file_val, thresh};
}
}
@@ -498,7 +500,7 @@
auto file_val = ften.at(s1);
auto ten_val = ConvertComplex(ten(s1));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1), fname, ten_val,
+return TestFailResult<ctype>{Index2Str(s1), fname, ten_val,
file_val, thresh};
}
}
4 changes: 2 additions & 2 deletions include/matx/core/tensor_desc.h
@@ -373,15 +373,15 @@ class static_tensor_desc_t {
* @param dim Dimension to retrieve
* @return Size of dimension
*/
-static constexpr auto Size(int dim) { return shape_[dim]; }
+static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) { return shape_[dim]; }

/**
* @brief Get stride of dimension
*
* @param dim Dimension to retrieve
* @return Stride of dimension
*/
-static constexpr auto Stride(int dim) { return stride_[dim]; }
+static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Stride(int dim) { return stride_[dim]; }

/**
* @brief Return strides contaienr of descriptor
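Adding __MATX_HOST__ __MATX_DEVICE__ to the constexpr accessors is what keeps Size() and Stride() callable from kernels once relaxed constexpr is no longer assumed; under nvcc, constexpr by itself implies only a host function. A standalone sketch of the annotation pattern (the function and kernel are ours, not MatX's):

```cuda
// Without an execution-space qualifier, a constexpr function is host-only
// under nvcc unless --expt-relaxed-constexpr is passed. Adding the qualifiers
// makes it callable from device code with no extra flags.
__host__ __device__ constexpr int clamp_dim(int d, int rank)
{
  return d < rank ? d : rank - 1;
}

__global__ void use_clamp(int *out, int d)
{
  out[0] = clamp_dim(d, 4);  // compiles without relaxed constexpr
}
```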
2 changes: 1 addition & 1 deletion include/matx/core/tensor_utils.h
@@ -99,7 +99,7 @@ namespace matx

for (int i = 1; i < op.Rank(); i++)
{
-maxSize = std::max(op.Size(i), maxSize);
+maxSize = cuda::std::max(op.Size(i), maxSize);
}

return maxSize;
11 changes: 11 additions & 0 deletions include/matx/core/type_utils.h
@@ -800,11 +800,22 @@ struct complex_type_of
typename C::value_type>>> {
};

+template <class C>
+struct cuda_complex_type_of
+: identity<cuda::std::complex<std::conditional_t<is_complex_half_v<C>, float,
+typename C::value_type>>> {
+};
+
template <class C>
using matx_convert_complex_type =
typename std::conditional_t<!is_complex_v<C>, identity<C>,
complex_type_of<C>>::type;

+template <class C>
+using matx_convert_cuda_complex_type =
+typename std::conditional_t<!is_complex_v<C>, identity<C>,
+cuda_complex_type_of<C>>::type;
+

template <class T, class = void> struct value_type {
using type = T;
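The new trait mirrors matx_convert_complex_type but resolves to cuda::std::complex instead of std::complex, which is what the pybind.h change uses for TestFailResult. A hedged illustration of what it should yield, assuming these aliases sit in namespace matx alongside the existing traits:

```cuda
#include <complex>
#include <cuda/std/complex>
#include <type_traits>
#include "matx/core/type_utils.h"

// Illustration only: exercises both branches of the conditional above.
static_assert(std::is_same_v<
    matx::matx_convert_cuda_complex_type<float>, float>);      // non-complex types pass through
static_assert(std::is_same_v<
    matx::matx_convert_cuda_complex_type<std::complex<float>>,
    cuda::std::complex<float>>);                                // std::complex maps to cuda::std::complex
```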
16 changes: 8 additions & 8 deletions include/matx/kernels/channelize_poly.cuh
@@ -80,7 +80,7 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter

constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
-const index_t last_out_elem = std::min(
+const index_t last_out_elem = cuda::std::min(
output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1);

if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) {
@@ -103,7 +103,7 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter

if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) {
for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
-const index_t first_ind = std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
+const index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
output_t accum {};
const filter_t *h = smem_filter;
// index_t in MatX should be signed (32 or 64 bit), so j-- below will not underflow
@@ -134,7 +134,7 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
}
} else {
for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
-index_t first_ind = std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
+index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
// If we use the last filter tap for this phase (which is the first index because
// the filter is flipped), then it may be a padded zero. If so, increment first_ind
// by 1 to avoid using the zero. This prevents a bounds-check in the inner loop.
@@ -227,7 +227,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
const uint32_t smem_input_height = filter_phase_len + by - 1;

const index_t start_elem = blockIdx.x * elems_per_channel_per_cta;
-const index_t last_elem = std::min(output_len_per_channel-1, (blockIdx.x+1) * elems_per_channel_per_cta - 1);
+const index_t last_elem = cuda::std::min(output_len_per_channel-1, (blockIdx.x+1) * elems_per_channel_per_cta - 1);
auto indims = BlockToIdx(input, blockIdx.z, 1);
auto outdims = BlockToIdx(output, blockIdx.z, 2);
outdims[ChannelRank] = chan;
@@ -256,7 +256,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
__syncthreads();

// Load next elems_per_channel_per_cta elements for each channel
-const index_t next_last_elem = std::min(next_start_elem + by - 1, last_elem);
+const index_t next_last_elem = cuda::std::min(next_start_elem + by - 1, last_elem);
const uint32_t out_samples_this_iter = static_cast<uint32_t>(next_last_elem - next_start_elem + 1);
if (ty < out_samples_this_iter) {
indims[InRank-1] = (next_start_elem + ty) * num_channels + chan;
@@ -286,7 +286,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
if (outdims[OutElemRank] <= last_elem) {
const filter_t *h = h_start;
output_t accum { 0 };
-const int first_end = std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
+const int first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
// The footprint of samples involved in the convolution may wrap from the end
// to the beginning of smem_input. The prologue below handles the samples from
// the current tail to the end of smem_input and the epilogue starts back at the
@@ -342,7 +342,7 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT

constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
-const index_t last_out_elem = std::min(
+const index_t last_out_elem = cuda::std::min(
output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1);

// Pre-compute the DFT complex exponentials and store in shared memory
@@ -371,7 +371,7 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT
for (int i = 0; i < NUM_CHAN; i++) {
accum[i] = static_cast<output_t>(0);
}
-index_t first_ind = std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
+index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
indims[InRank-1] = t * NUM_CHAN + NUM_CHAN - 1;
index_t j_start = t;
index_t h_ind { 0 };
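Inside these __global__ kernels the swap is required rather than stylistic: without --expt-relaxed-constexpr, calls to std::min and std::max do not compile in device code at all. A tiny sketch of the clamping idiom used for last_out_elem above (the kernel and names are ours):

```cuda
#include <cuda/std/__algorithm>

// Hypothetical kernel: clamp the last element a block handles to the end of
// the signal, the same way the channelizer kernels bound last_out_elem.
__global__ void clamp_block_range(long long len, long long per_block, long long *last)
{
  const long long first = static_cast<long long>(blockIdx.x) * per_block;
  last[blockIdx.x] = cuda::std::min(len - 1, first + per_block - 1);
}
```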
2 changes: 1 addition & 1 deletion include/matx/kernels/filter.cuh
@@ -23,7 +23,7 @@

#define COMPLEX_TYPE cuComplex

-// std::max/min isn't working on template value parameters
+// cuda::std::max/min isn't working on template value parameters
#define MAX(a, b) ((a) < (b) ? (b) : (a))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

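The comment edit only keeps the note accurate after the namespace switch; the MAX/MIN macros stay because the places they are used need plain integral constant expressions built from non-type template parameters, where the function form has been reported not to work. A hedged sketch of that kind of use (the kernel is ours, not filter.cuh's):

```cuda
#define MAX(a, b) ((a) < (b) ? (b) : (a))

// Hypothetical use: a shared-memory extent must be an integral constant
// expression, and the macro keeps it one when built from template value
// parameters. Launch with blockDim.x <= MAX(A, B).
template <int A, int B>
__global__ void scratch_kernel(float *out)
{
  __shared__ float scratch[MAX(A, B)];
  scratch[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    out[blockIdx.x] = scratch[MAX(A, B) - 1];
  }
}
```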
26 changes: 13 additions & 13 deletions include/matx/kernels/resample_poly.cuh
@@ -189,7 +189,7 @@ __global__ void ResamplePoly1D_PhaseBlock(OutType output, InType input, FilterTy
const index_t max_input_ind = input_len - 1;

const index_t start_ind = phase_ind + up * (tid + elem_block * elems_per_thread * THREADS);
-const index_t last_ind = std::min(output_len - 1, start_ind + elems_per_thread * THREADS * up);
+const index_t last_ind = cuda::std::min(output_len - 1, start_ind + elems_per_thread * THREADS * up);
for (index_t out_ind = start_ind; out_ind <= last_ind; out_ind += THREADS * up) {
// out_ind is the index in the output array and up_ind = out_ind * down is the
// corresponding index in the upsampled array
@@ -203,9 +203,9 @@ __global__ void ResamplePoly1D_PhaseBlock(OutType output, InType input, FilterTy
// of valid samples before input_ind. In the case that the filter is not
// long enough to include input_ind, last_filter_ind is left_filter_ind - up
// and thus left_h_ind and prologue are both -1.
-const index_t prologue = std::min(input_ind, left_h_ind);
+const index_t prologue = cuda::std::min(input_ind, left_h_ind);
// epilogue is the number of valid samples after input_ind.
-const index_t epilogue = std::min(max_input_ind - input_ind, max_h_epilogue);
+const index_t epilogue = cuda::std::min(max_input_ind - input_ind, max_h_epilogue);
// n is the number of valid samples. If input_ind is not valid because it
// precedes the reach of the filter, then prologue = -1 and n is just the
// epilogue.
@@ -302,12 +302,12 @@ __global__ void ResamplePoly1D_ElemBlock(OutType output, InType input, FilterTyp
// whether or not the filter has been loaded to shared memory.
const index_t filter_central_tap = (filter_len-1)/2;
const index_t start_ind = elem_block * elems_per_thread * THREADS + tid;
-const index_t last_ind = std::min(output_len - 1, start_ind + (elems_per_thread-1) * THREADS);
+const index_t last_ind = cuda::std::min(output_len - 1, start_ind + (elems_per_thread-1) * THREADS);
if (load_filter_to_smem) {
for (index_t out_ind = start_ind; out_ind <= last_ind; out_ind += THREADS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
// Since the filter is in shared memory, we can narrow the index type to 32 bits
@@ -333,8 +333,8 @@ __global__ void ResamplePoly1D_ElemBlock(OutType output, InType input, FilterTyp
} else {
for (index_t out_ind = start_ind; out_ind <= last_ind; out_ind += THREADS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
index_t h_ind = filter_central_tap + (up_ind - up*x_start);
@@ -409,12 +409,12 @@ __global__ void ResamplePoly1D_WarpCentric(OutType output, InType input, FilterT
const index_t filter_len_half = filter_len/2;
const index_t filter_central_tap = (filter_len-1)/2;
const index_t start_ind = elem_block * elems_per_warp * NUM_WARPS;
-const index_t last_ind = std::min(output_len - 1, start_ind + elems_per_warp * NUM_WARPS - 1);
+const index_t last_ind = cuda::std::min(output_len - 1, start_ind + elems_per_warp * NUM_WARPS - 1);
if (load_filter_to_smem) {
for (index_t out_ind = start_ind+warp_id; out_ind <= last_ind; out_ind += NUM_WARPS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
// Since the filter is in shared memory, we can narrow the index type to 32 bits
@@ -449,8 +449,8 @@ __global__ void ResamplePoly1D_WarpCentric(OutType output, InType input, FilterT
} else {
for (index_t out_ind = start_ind+warp_id; out_ind <= last_ind; out_ind += NUM_WARPS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
index_t h_ind = filter_central_tap + (up_ind - up*x_start);
20 changes: 10 additions & 10 deletions include/matx/operators/conv.h
@@ -46,7 +46,7 @@ namespace matx
private:
using out_t = std::conditional_t<is_complex_v<typename OpA::scalar_type>,
typename OpA::scalar_type, typename OpB::scalar_type>;
-constexpr static int max_rank = std::max(OpA::Rank(), OpB::Rank());
+constexpr static int max_rank = cuda::std::max(OpA::Rank(), OpB::Rank());
OpA a_;
OpB b_;
matxConvCorrMode_t mode_;
@@ -82,8 +82,8 @@ namespace matx
for (int r = 0; r < Rank(); r++) {
const int axis = perm[r];
if (axis == Rank() - 1) {
-max_axis = std::max(a_.Size(r), b_.Size(r));
-min_axis = std::min(a_.Size(r), b_.Size(r));
+max_axis = cuda::std::max(a_.Size(r), b_.Size(r));
+min_axis = cuda::std::min(a_.Size(r), b_.Size(r));

if (mode_ == MATX_C_MODE_FULL) {
out_dims_[axis] = a_.Size(r) + b_.Size(r) - 1;
@@ -112,8 +112,8 @@ namespace matx
}
}

-max_axis = std::max(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));
-min_axis = std::min(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));
+max_axis = cuda::std::max(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));
+min_axis = cuda::std::min(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));

if (mode_ == MATX_C_MODE_FULL) {
out_dims_[max_rank-1] = max_axis + min_axis - 1;
@@ -231,7 +231,7 @@ namespace detail {
private:
using out_t = std::conditional_t<is_complex_v<typename OpA::scalar_type>,
typename OpA::scalar_type, typename OpB::scalar_type>;
-constexpr static int max_rank = std::max(OpA::Rank(), OpB::Rank());
+constexpr static int max_rank = cuda::std::max(OpA::Rank(), OpB::Rank());
OpA a_;
OpB b_;
matxConvCorrMode_t mode_;
@@ -257,8 +257,8 @@ namespace detail {
for (int r = 0; r < Rank(); r++) {
const int axis = perm[r];
if (axis >= Rank() - 2) {
-const auto max_axis = std::max(a_.Size(r), b_.Size(r));
-const auto min_axis = std::min(a_.Size(r), b_.Size(r));
+const auto max_axis = cuda::std::max(a_.Size(r), b_.Size(r));
+const auto min_axis = cuda::std::min(a_.Size(r), b_.Size(r));
if (mode_ == MATX_C_MODE_FULL) {
out_dims_[axis] = a_.Size(r) + b_.Size(r) - 1;
}
@@ -287,8 +287,8 @@ namespace detail {
}

for (int r = max_rank - 2; r < max_rank; r++) {
-const auto max_axis = std::max(a_.Size(r), b_.Size(r));
-const auto min_axis = std::min(a_.Size(r), b_.Size(r));
+const auto max_axis = cuda::std::max(a_.Size(r), b_.Size(r));
+const auto min_axis = cuda::std::min(a_.Size(r), b_.Size(r));
if (mode_ == MATX_C_MODE_FULL) {
out_dims_[r] = max_axis + min_axis - 1;
}
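cuda::std::max is also constexpr, so compile-time uses such as max_rank above keep working without any relaxed-constexpr flag. A minimal sketch (the rank functions are made-up stand-ins for OpA::Rank() and OpB::Rank()):

```cuda
#include <cuda/std/__algorithm>

constexpr int RankA() { return 2; }  // hypothetical stand-in for OpA::Rank()
constexpr int RankB() { return 3; }  // hypothetical stand-in for OpB::Rank()

struct conv_like_op {
  // Mirrors conv.h: the maximum operator rank is computed at compile time.
  constexpr static int max_rank = cuda::std::max(RankA(), RankB());
};

static_assert(conv_like_op::max_rank == 3, "usable in constant expressions");
```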