NVIDIA · sjeaugey · Mar 27, 2020 · Apr 21, 2020
diff --git a/makefiles/version.mk b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 6
-NCCL_PATCH   := 4
-NCCL_SUFFIX  :=
+NCCL_MINOR   := 7
+NCCL_PATCH   := 0
+NCCL_SUFFIX  := p4
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
                 misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
 		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
-                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
+                collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
                 graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
 
 ##### lib files

diff --git a/src/channel.cc b/src/channel.cc
@@ -1,29 +1,16 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "channel.h"
 #include "param.h"
-#include "graph.h"
-
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", -2);
 
 ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
   struct ncclChannel* channel = comm->channels+channelid;
   channel->id = channelid;
 
-  // Setup intermediate buffering
-  int buffSize = ncclParamBuffsize();
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
-  channel->buffSize = buffSize != -2 ? buffSize :
-	  cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
-
   // Ring index to user rank table.
   NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
   NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
@@ -37,7 +24,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
   }
 
   // Per-channel operation list.
-  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+  NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
   return ncclSuccess;
 }
 

diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
 BUILDDIR ?= $(abspath ../../../build)
 OBJDIR := $(BUILDDIR)/obj/collectives/device
 
-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu
 
 LIBSRCFILES += functions.cu
 

diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,26 +11,27 @@
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads-WARP_SIZE;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads-WARP_SIZE;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->N;
-  const int nranks = comm->nRanks;
-  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
   const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
-  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+  const ssize_t size = args->coll.count;
 
   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;
 
   ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
     ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
     ssize_t chunkOffset = gridOffset + bid*realChunkSize;
 
@@ -75,27 +76,27 @@ __device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
-  //const int rank = comm->rank;
+  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
   const int nranks = comm->nRanks;
-  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;
+
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
 
   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
-      chunkSize = args->lastChunkSize;
+      chunkSize = args->coll.lastChunkSize;
     }
     ssize_t chunkOffset = gridOffset + bid*chunkSize;
 
@@ -140,29 +141,28 @@ __device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
-  //const int rank = comm->rank;
-  const int nranks = comm->nRanks;
-  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
   // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
   const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;
 
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
 
   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
 
     ssize_t chunkOffset = gridOffset + bid*chunkSize;