Skip to content

Commit

Permalink
Merge pull request #106 from janciesko/nbi_ops
Browse files Browse the repository at this point in the history
Add double3 support to nvshmem and non-blocking op variants
Makes local_deep_copy work correctly with subviews of subviews
Adds non-blocking ops and non-blocking block ops to nvshmem
Add missing nvshmem_quiet ops
  • Loading branch information
janciesko authored Sep 5, 2024
2 parents 94003ba + fdbe0e8 commit 4ce2925
Show file tree
Hide file tree
Showing 11 changed files with 269 additions and 127 deletions.
11 changes: 6 additions & 5 deletions applications/cgsolve/rma/cgsolve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ int main(int argc, char *argv[]) {
// Allocate global size (runtime splits into chunks)
RemoteView_t p = RemoteView_t("MyView", numRanks * h_x.extent(0));
#else
RemoteView_t p = RemoteView_t("MyView", h_x.extent(0));
RemoteView_t p = RemoteView_t("MyView", numRanks, h_x.extent(0));
#endif
Kokkos::Timer timer;
int num_iters = cg_solve(y, A, x, p, max_iter, tolerance);
Expand Down Expand Up @@ -296,13 +296,14 @@ int main(int argc, char *argv[]) {

if (myRank == 0) {
#ifndef KOKKOS_REMOTE_SPACES_ENABLE_DEBUG
printf("%i, %i, %.2e, %.6lf, %.6lf, %.6lf\n", N, num_iters, total_flops,
time, GFlops, GBs);
printf("%i, %i, %i, %.2e, %.6lf, %.6lf, %.6lf\n", numRanks, N, num_iters,
total_flops, time, GFlops, GBs);
#else
printf(
"N, num_iters, total_flops, time, GFlops, BW(GB/sec), %i, %i, %.2e, "
"ranks, N, num_iters, total_flops, time, GFlops, BW(GB/sec), %i, %i, "
"%i, %.2e, "
"%.6lf, %.6lf, %.6lf\n",
N, num_iters, total_flops, time, GFlops, GBs);
numRanks, N, num_iters, total_flops, time, GFlops, GBs);
#endif
}
}
Expand Down
20 changes: 10 additions & 10 deletions applications/cgsolve/scripts/run_over_size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,22 @@ DEVICE_ID_4=3
HASH=`date|md5sum|head -c 5`
FILENAME="${BENCHMARK}_${HASH}"
echo $FILENAME
VARS0="--bind-to core --map-by socket"
VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/12.0.0/gcc/12.2.0/base/rantbbm/lib64/:$LD_LIBRARY_PATH"
#VARS0="--bind-to core --map-by socket"
VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/12.0.0/gcc/12.2.0/base/rantbbm/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=8589934592"


#One rank
FILENAME_ACTUAL=$FILENAME"_1x1x1.res"
echo "N,num_iters,total_flops,time,GFlops,BW(GB/sec" | tee $FILENAME_ACTUAL
for S in 10 20 40 80 160 320 640; do
echo "ranks,N,num_iters,total_flops,time,GFlops,BW(GB/sec)" | tee $FILENAME_ACTUAL
for S in 10 20 40 80 160 300; do
for reps in $(seq 1 3); do
CUDA_VISIBLE_DEVICES=$DEVICE_ID_1 mpirun -np 1 $VARS0 $VARS1 -host $HOST1 ./$BENCHMARK $S 10 | tee -a $FILENAME_ACTUAL
done
done
done

#Two ranks
FILENAME_ACTUAL=$FILENAME"_1x1x2.res"
echo "N,num_iters,total_flops,time,GFlops,BW(GB/sec" | tee $FILENAME_ACTUAL
echo "ranks,N,num_iters,total_flops,time,GFlops,BW(GB/sec)" | tee $FILENAME_ACTUAL
for S in 10 20 40 80 160 300; do
for reps in $(seq 1 3); do
CUDA_VISIBLE_DEVICES=$DEVICE_ID_1 mpirun -np 1 $VARS0 $VARS1 -host $HOST1 ./$BENCHMARK $S 10 : \
Expand All @@ -38,7 +38,7 @@ done

#Two ranks
FILENAME_ACTUAL=$FILENAME"_1x2x1.res"
echo "N,num_iters,total_flops,time,GFlops,BW(GB/sec" | tee $FILENAME_ACTUAL
echo "ranks,N,num_iters,total_flops,time,GFlops,BW(GB/sec)" | tee $FILENAME_ACTUAL
for S in 10 20 40 80 160 300; do
for reps in $(seq 1 3); do
CUDA_VISIBLE_DEVICES=$DEVICE_ID_1 mpirun -np 1 $VARS0 $VARS1 -host $HOST1 ./$BENCHMARK $S 10 : \
Expand All @@ -48,16 +48,16 @@ done

#Two ranks
FILENAME_ACTUAL=$FILENAME"_2x1x1.res"
echo "N,num_iters,total_flops,time,GFlops,BW(GB/sec" | tee $FILENAME_ACTUAL
echo "ranks,N,num_iters,total_flops,time,GFlops,BW(GB/sec)" | tee $FILENAME_ACTUAL
for S in 10 20 40 80 160 300; do
for reps in $(seq 1 3); do
CUDA_VISIBLE_DEVICES=$DEVICE_ID_1 mpirun -np 2 $VARS0 $VARS1 -host $HOST1,$HOST2 ./$BENCHMARK $S 10 | tee -a $FILENAME_ACTUAL
done
done

# #Four ranks
#Four ranks
# FILENAME_ACTUAL=$FILENAME"_4x1x1.res"
# echo "N,num_iters,total_flops,time,GFlops,BW(GB/sec" | tee $FILENAME_ACTUAL
# echo "ranks,N,num_iters,total_flops,time,GFlops,BW(GB/sec)" | tee $FILENAME_ACTUAL
# for S in 10 20 40 80 160 300; do
# for reps in $(seq 1 3); do
# CUDA_VISIBLE_DEVICES=$DEVICE_ID_1 mpirun -np 4 $VARS0 $VARS1 -host $HOST1,$HOST2,$HOST3,$HOST4 ./$BENCHMARK $S 10 | tee -a $FILENAME_ACTUAL
Expand Down
40 changes: 21 additions & 19 deletions benchmarks/access_overhead/access_overhead_p2p.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
#include <type_traits>
#include <string>

#define NUM_TEAMS 256
// League size (number of teams) passed to the "block_transfer" team policies.
#define LDC_LEAGUE_SIZE 4096
// Team size passed together with LDC_LEAGUE_SIZE to the same team policies.
#define LDC_TEAM_SIZE 1
//#define CHECK_FOR_CORRECTNESS

using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;
Expand Down Expand Up @@ -127,15 +128,15 @@ struct Access<ViewType_t, typename std::enable_if_t<
};

KOKKOS_FUNCTION
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank; }
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; }

// UpdateTag kernel: adds element i of v_tmp onto element i of v.
KOKKOS_FUNCTION
void operator()(const UpdateTag &, const size_t i) const { v(i) += v_tmp(i); }

KOKKOS_FUNCTION
void operator()(const CheckTag &, const size_t i) const {
assert(v(i) == typename ViewType_t::traits::value_type(iters * other_rank +
my_rank));
assert(v(i) == typename ViewType_t::traits::value_type(
iters * (other_rank + 1) + (my_rank + 1)));
}

// run copy benchmark
Expand Down Expand Up @@ -216,15 +217,15 @@ struct Access_CudaAware<
};

KOKKOS_FUNCTION
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank; }
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; }

// UpdateTag kernel: adds element i of v_tmp onto element i of v.
KOKKOS_FUNCTION
void operator()(const UpdateTag &, const size_t i) const { v(i) += v_tmp(i); }

KOKKOS_FUNCTION
void operator()(const CheckTag &, const size_t i) const {
assert(v(i) == typename ViewType_t::traits::value_type(iters * other_rank +
my_rank));
assert(v(i) == typename ViewType_t::traits::value_type(
iters * (other_rank + 1) + (my_rank + 1)));
}

// run copy benchmark
Expand Down Expand Up @@ -292,7 +293,7 @@ struct Access<ViewType_t, typename std::enable_if_t<
};

KOKKOS_FUNCTION
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank; }
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; }

KOKKOS_FUNCTION
void operator()(const UpdateTag_get &, const size_t i) const {
Expand All @@ -306,8 +307,8 @@ struct Access<ViewType_t, typename std::enable_if_t<

KOKKOS_FUNCTION
void operator()(const CheckTag &, const size_t i) const {
assert(v(i) == typename ViewType_t::traits::value_type(iters * other_rank +
my_rank));
assert(v(i) == typename ViewType_t::traits::value_type(
iters * (other_rank + 1) + (my_rank + 1)));
}

// run copy benchmark
Expand Down Expand Up @@ -433,12 +434,12 @@ struct Access_LDC<
}

KOKKOS_FUNCTION
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank; }
void operator()(const InitTag &, const size_t i) const { v(i) = my_rank + 1; }

KOKKOS_FUNCTION
void operator()(const CheckTag &, const size_t i) const {
assert(v(i) == typename ViewType_t::traits::value_type(iters * other_rank +
my_rank));
assert(v(i) == typename ViewType_t::traits::value_type(
iters * (other_rank + 1) + (my_rank + 1)));
}

KOKKOS_FUNCTION
Expand Down Expand Up @@ -488,8 +489,9 @@ struct Access_LDC<
for (int i = 0; i < iters; i++) {
if (my_rank == 0) {
time_a = timer.seconds();
Kokkos::parallel_for("block_transfer",
team_policy_get_update_t(NUM_TEAMS, 1), *this);
Kokkos::parallel_for(
"block_transfer",
team_policy_get_update_t(LDC_LEAGUE_SIZE, LDC_TEAM_SIZE), *this);

Kokkos::fence();
#if defined(KOKKOS_REMOTE_SPACES_ENABLE_DEBUG)
Expand All @@ -502,6 +504,7 @@ struct Access_LDC<
Kokkos::parallel_for(
"update", policy_update_t(local_range.first, local_range.second),
*this);
Kokkos::fence();
RemoteSpace_t().fence();
time_b = timer.seconds();
time += time_b - time_a;
Expand All @@ -514,8 +517,9 @@ struct Access_LDC<
for (int i = 0; i < iters; i++) {
if (my_rank == 0) {
time_a = timer.seconds();
Kokkos::parallel_for("block_transfer",
team_policy_put_update_t(NUM_TEAMS, 1), *this);
Kokkos::parallel_for(
"block_transfer",
team_policy_put_update_t(LDC_LEAGUE_SIZE, LDC_TEAM_SIZE), *this);
Kokkos::fence();
RemoteSpace_t().fence();
#if defined(KOKKOS_REMOTE_SPACES_ENABLE_DEBUG)
Expand Down Expand Up @@ -549,7 +553,6 @@ struct Access_LDC<
"access_overhead-check",
policy_check_t(local_range.first, local_range.second), *this);
Kokkos::fence();
RemoteSpace_t().fence();
}
} else {
// check on rank 1
Expand All @@ -558,7 +561,6 @@ struct Access_LDC<
"access_overhead-check",
policy_check_t(local_range.first, local_range.second), *this);
Kokkos::fence();
RemoteSpace_t().fence();
}
}
#endif
Expand Down
12 changes: 6 additions & 6 deletions benchmarks/access_overhead/scripts/run_over_size_p2p.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
BENCHMARK=$1
HOST1=$2
HOST2=$3
DEFAULT_SIZE=128
DEFAULT_SIZE=33554432 #128

#exports
export OMP_PROC_BIND=spread
Expand All @@ -11,13 +11,13 @@ export OMP_NUM_THREADS=32

ITERS=30

#XBus (Summit-like systems)
#NVLink (Summit-like systems)
DEVICE_ID_1=0
DEVICE_ID_2=2
DEVICE_ID_2=1

#NVLInk (=||=)
#XBus (Summit-like systems)
#DEVICE_ID_1=0
#DEVICE_ID_2=1
#DEVICE_ID_2=2

#IB
#DEVICE_ID_1=0
Expand All @@ -28,7 +28,7 @@ FILENAME="${BENCHMARK}_${HASH}_p2p.res"
echo $FILENAME
echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME
VARS0="--bind-to core --map-by socket"
VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/12.0.0/gcc/12.2.0/base/rantbbm/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888"
VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/11.8.0/gcc/9.3.0/base/c3ajoqf/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888"

# Some more potential optimizations
#VARS1="" #-x UCX_WARN_UNUSED_ENV_VARS=n -x HCOLL_RCACHE=^ucs -x \
Expand Down
21 changes: 21 additions & 0 deletions src/core/Kokkos_RemoteSpaces_Helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,22 @@
namespace Kokkos {
namespace Experimental {

/* Compile-time trait: value is true exactly when the array layout of
   view type T (T::traits::array_layout) is Kokkos::LayoutLeft. */
template <class T>
struct Is_LayoutLeft {
 private:
  using view_layout = typename T::traits::array_layout;

 public:
  enum : bool { value = std::is_same<view_layout, Kokkos::LayoutLeft>::value };
};

/* Compile-time trait: value is true exactly when the array layout of
   view type T (T::traits::array_layout) is Kokkos::LayoutRight. */
template <class T>
struct Is_LayoutRight {
 private:
  using view_layout = typename T::traits::array_layout;

 public:
  enum : bool { value = std::is_same<view_layout, Kokkos::LayoutRight>::value };
};

template <class T>
struct Is_Partitioned_Layout {
enum : bool {
Expand Down Expand Up @@ -79,13 +95,16 @@ struct RemoteSpaces_View_Properties {
T R0_offset;
/* Num local elems in dim0 */
T R0_size;
/* Total offset incl. R0 */
T total_offset;
/* Communicator size (number of PEs) and rank of this PE */
int num_PEs;
int my_PE;

KOKKOS_FUNCTION
RemoteSpaces_View_Properties() {
using_local_indexing = false;
total_offset = 0;
R0_offset = 0;
R0_size = 0;
num_PEs = Kokkos::Experimental::get_num_pes();
Expand All @@ -95,6 +114,7 @@ struct RemoteSpaces_View_Properties {
KOKKOS_FUNCTION
RemoteSpaces_View_Properties(const RemoteSpaces_View_Properties &rhs) {
using_local_indexing = rhs.using_local_indexing;
total_offset = rhs.total_offset;
R0_offset = rhs.R0_offset;
R0_size = rhs.R0_size;
num_PEs = rhs.num_PEs;
Expand All @@ -104,6 +124,7 @@ struct RemoteSpaces_View_Properties {
KOKKOS_FUNCTION RemoteSpaces_View_Properties &operator=(
const RemoteSpaces_View_Properties &rhs) {
using_local_indexing = rhs.using_local_indexing;
total_offset = rhs.total_offset;
R0_offset = rhs.R0_offset;
R0_size = rhs.R0_size;
num_PEs = rhs.num_PEs;
Expand Down
Loading

0 comments on commit 4ce2925

Please sign in to comment.