Saeed Maleki 2021-04-08 23:37:17 +00:00
Parent 007fc390c7
Commit d83e777ac3
8 changed files with 221 additions and 24 deletions

View file

@@ -14,6 +14,6 @@ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t sendcount
NVTX3_FUNC_RANGE_IN(nccl_domain);
struct ncclInfo info = { ncclFuncAllToAll, "AllToAll",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
SCKL_CHUNKSTEPS, SCKL_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
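For context, a minimal host-side sketch of invoking this collective. The signature is inferred from the ncclInfo initializer above; comm, stream, and nRanks are assumed to come from the usual ncclCommInitRank/cudaStreamCreate setup, and treating sendcount as the per-peer element count is also an assumption:

// Sketch only: assumes nccl.h / cuda_runtime.h, an initialized ncclComm_t comm,
// a cudaStream_t stream, and nRanks obtained from ncclCommCount.
size_t countPerRank = 1 << 20;                 // per-peer element count (assumed meaning of sendcount)
float *sendbuff, *recvbuff;
cudaMalloc(&sendbuff, countPerRank * nRanks * sizeof(float));
cudaMalloc(&recvbuff, countPerRank * nRanks * sizeof(float));
ncclAllToAll(sendbuff, recvbuff, countPerRank, ncclFloat, comm, stream);
cudaStreamSynchronize(stream);                 // the collective runs asynchronously on stream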

View file

@@ -132,7 +132,6 @@ class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UN
}
};
#include "prims_ll128.h"
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
@@ -207,11 +206,29 @@ class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL>
__device__ void run(struct ncclWorkElem* args) {}
};
template<int PROTO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_SCKL, PROTO, FUNC, T, UNROLL> {
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_SCKL, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ void run(struct ncclWorkElem* args) {
SCKLFunctions<PROTO, FUNC, T, UNROLL> scklfunc;
SCKLFunctionSimple<FUNC, T, UNROLL> scklfunc;
scklfunc.run(args);
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_SCKL, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ void run(struct ncclWorkElem* args) {
SCKLFunctionLL128<FUNC, T, UNROLL> scklfunc;
scklfunc.run(args);
}
};
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllGather, NCCL_ALGO_SCKL, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ void run(struct ncclWorkElem* args) {
SCKLFunctionLL<FUNC, T, UNROLL> scklfunc;
scklfunc.run(args);
}
};

View file

@@ -9,11 +9,29 @@
#include "collectives.h"
#include "sckl_interpreter.h"
template<int ALGO, int PROTO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllToAll, ALGO, PROTO, FUNC, T, UNROLL> {
template<int ALGO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllToAll, ALGO, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
__device__ void run(struct ncclWorkElem* args) {
SCKLFunctions<PROTO, FUNC, T, UNROLL> scklfunc;
SCKLFunctionSimple<FUNC, T, UNROLL> scklfunc;
scklfunc.run(args);
}
};
template<int ALGO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllToAll, ALGO, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
public:
__device__ void run(struct ncclWorkElem* args) {
SCKLFunctionLL128<FUNC, T, UNROLL> scklfunc;
scklfunc.run(args);
}
};
template<int ALGO, class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncAllToAll, ALGO, NCCL_PROTO_LL, FUNC, T, UNROLL> {
public:
__device__ void run(struct ncclWorkElem* args) {
SCKLFunctionLL<FUNC, T, UNROLL> scklfunc;
scklfunc.run(args);
}
};
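The two files above replace the single protocol-generic SCKLFunctions<PROTO, ...> with one class per protocol, since each protocol drives a different primitives type (ncclPrimitives, ncclLL128Primitives, ncclLLPrimitives). A standalone, compilable sketch of that specialization idiom; every name in it is hypothetical and only mirrors the structure of the diff:

// Per-protocol specialization idiom, reduced to a host-side toy.
#include <cstdio>
struct SimplePrims { void send() { std::puts("simple"); } };  // stands in for ncclPrimitives
struct LL128Prims  { void send() { std::puts("ll128");  } };  // stands in for ncclLL128Primitives
struct LLPrims     { void send() { std::puts("ll");     } };  // stands in for ncclLLPrimitives
enum { PROTO_SIMPLE, PROTO_LL128, PROTO_LL };
template<int PROTO> struct ScklFunc;  // primary template: no generic body is possible,
                                      // because the primitives type differs per protocol
template<> struct ScklFunc<PROTO_SIMPLE> { void run() { SimplePrims{}.send(); } };
template<> struct ScklFunc<PROTO_LL128>  { void run() { LL128Prims{}.send();  } };
template<> struct ScklFunc<PROTO_LL>     { void run() { LLPrims{}.send();     } };
int main() {
  ScklFunc<PROTO_SIMPLE>{}.run();
  ScklFunc<PROTO_LL128>{}.run();
  ScklFunc<PROTO_LL>{}.run();
}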

View file

@@ -106,7 +106,6 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
w->index += NCCL_MAX_OPS;
}
}
if (w->funcIndex == FINDEX) {
f.run(w);
} else {

View file

@@ -14,8 +14,8 @@
#define COMPUTE_FLAG(__WORKINDEX__,__GRIDOFFSET_ITER__,__STEP__) \
SCKL_MAX_ITER*SCKL_MAX_NUM_STEPS*__WORKINDEX__ + (__GRIDOFFSET_ITER__ * SCKL_MAX_NUM_STEPS + __STEP__)
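A host-checkable worked example of the COMPUTE_FLAG encoding just defined. SCKL_MAX_NUM_STEPS is 16 elsewhere in this commit; SCKL_MAX_ITER's value is not shown in this diff, so the stand-in below is an assumption:

// Worked check of the COMPUTE_FLAG encoding: flags grow strictly with
// (workIndex, iter, step), so a spin-wait on "flag >= goal" orders dependent steps.
#include <cassert>
#include <cstdint>
constexpr uint64_t kMaxIter = 65536;  // assumed stand-in for SCKL_MAX_ITER
constexpr uint64_t kMaxSteps = 16;    // SCKL_MAX_NUM_STEPS in this commit
constexpr uint64_t computeFlag(uint64_t work, uint64_t iter, uint64_t step) {
  return kMaxIter * kMaxSteps * work + (iter * kMaxSteps + step);
}
int main() {
  assert(computeFlag(1, 0, 1) > computeFlag(1, 0, 0));                        // later step
  assert(computeFlag(1, 1, 0) > computeFlag(1, 0, kMaxSteps - 1));            // later iteration
  assert(computeFlag(2, 0, 0) > computeFlag(1, kMaxIter - 1, kMaxSteps - 1)); // later work item
}

Strict monotonicity in (workIndex, iter, step) is what lets a waiting thread block spin on flag < goalFlag, as the interpreters below do.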
template<int PROTO, class FUNC, typename T, int UNROLL>
class SCKLFunctions {
template<class FUNC, typename T, int UNROLL>
class SCKLFunctionSimple {
public:
__device__ void run(struct ncclWorkElem* args) {
struct ncclDevComm* comm = args->comm;
@@ -24,7 +24,6 @@ class SCKLFunctions {
const int nthreads = args->nThreads-WARP_SIZE;
const int sync_tid = args->nThreads-1; // last thread is most likely not doing anything and is used for sckl cross-thread synchronization
const int bid = blockIdx.x;
// const int nChannels = args->coll.nChannels;
const int scklNBlocks = scklAlgo->nBlocks;
const int rscklbid = bid % scklNBlocks; // bid within a sckl algo
const int scklIndex = bid / scklNBlocks; // which instance of sckl algo
@@ -33,10 +32,9 @@
const int channelId = scklIndex * scklAlgo->nChannels + scklTB->channelId;
struct ncclChannel* channel = comm->channels+channelId;
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
const int chunkSize = stepSize * SCKL_CHUNKSTEPS;
const int nranks = comm->nRanks;
const int nchunksPerLoopPerRank = scklAlgo->nchunksPerLoop/nranks;
// const int totalNChunksPerLoopPerRank = nScklInstnaces*nchunksPerLoopPerRank;
const ssize_t loopSize = (ssize_t)chunkSize*nScklInstnaces;
const ssize_t size = args->coll.count;
const ssize_t sizePerScklChunk = size/nchunksPerLoopPerRank;
@@ -47,11 +45,10 @@ class SCKLFunctions {
// Compute pointers
T * thisInput = (T*)args->sendbuff;
T * thisOutput = (T*)args->recvbuff;
// int myRank = channel->ring.devUserRanks[0];
int recvPeer = scklTB->recvpeer;
int sendPeer = scklTB->sendpeer;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 1, 1, FUNC>
ncclPrimitives<UNROLL, SCKL_CHUNKSTEPS/SCKL_SLICESTEPS, SCKL_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &recvPeer, &sendPeer, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0, iter = 0; gridOffset < sizePerScklChunk; gridOffset += loopSize, iter++) {
int realChunkSize = min(chunkSize, DIVUP(sizePerScklChunk-gridOffset,nScklInstnaces));
@@ -101,3 +98,169 @@ class SCKLFunctions {
}
}
};
#include "prims_ll128.h"
template<class FUNC, typename T, int UNROLL>
class SCKLFunctionLL128 {
public:
__device__ void run(struct ncclWorkElem* args) {
struct ncclDevComm* comm = args->comm;
struct scklAlgorithm* scklAlgo = &comm->scklAlgo;
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int sync_tid = args->nThreads-1; // last thread is most likely not doing anything and is used for sckl cross-thread synchronization
const int bid = blockIdx.x;
const int scklNBlocks = scklAlgo->nBlocks;
const int rscklbid = bid % scklNBlocks; // bid within a sckl algo
const int scklIndex = bid / scklNBlocks; // which instance of sckl algo
const int nScklInstnaces = gridDim.x / scklAlgo->nBlocks; // number of sckl algo instances
struct scklThreadBlock* scklTB = &scklAlgo->scklTB[rscklbid];
const int channelId = scklIndex * scklAlgo->nChannels + scklTB->channelId;
struct ncclChannel* channel = comm->channels+channelId;
const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
const int nranks = comm->nRanks;
const int nchunksPerLoopPerRank = scklAlgo->nchunksPerLoop/nranks;
const ssize_t loopSize = (ssize_t)chunkSize*nScklInstnaces;
const ssize_t size = args->coll.count;
const ssize_t sizePerScklChunk = size/nchunksPerLoopPerRank;
// sckl flags all start out at 0. the work index is folded into the flag so that different work items use distinct synchronization values
// this still needs more work: when the work queue wraps around, the flag might have been left at an undesired value. will be fixed in subsequent versions.
const int workIndex = args->index+1;
volatile struct scklFlag* scklFlags = comm->scklFlags;
// Compute pointers
T * thisInput = (T*)args->sendbuff;
T * thisOutput = (T*)args->recvbuff;
int recvPeer = scklTB->recvpeer;
int sendPeer = scklTB->sendpeer;
ncclLL128Primitives<T, FUNC, 1, 1> prims(tid, nthreads, &recvPeer, &sendPeer, stepSize, channel, comm);
for (ssize_t gridOffset = 0, iter = 0; gridOffset < sizePerScklChunk; gridOffset += loopSize, iter++) {
chunkSize = min(chunkSize, DIVUP(sizePerScklChunk-gridOffset,nScklInstnaces*minChunkSize)*minChunkSize);
ssize_t chunkOffset = gridOffset + scklIndex*chunkSize;
ssize_t srcoffset, dstoffset;
T* srcPointer, * dstPointer;
int nelem = min(chunkSize, sizePerScklChunk-chunkOffset);
for (int i = 0; i < scklTB->nsteps; i++){
struct scklTransfer* sckltran = &scklTB->transfers[i];
if (sckltran->type == SCKL_NO_OP) continue;
// first wait if there is a dependence
int8_t dependentBid = sckltran->dependentBid + scklIndex * scklNBlocks;
int8_t dependentStep = sckltran->dependentStep;
if (sckltran->dependentBid >= 0){
if (tid == sync_tid){
uint64_t goalFlag = COMPUTE_FLAG(workIndex, iter, dependentStep);
while ((scklFlags + dependentBid)->flag < goalFlag){};
}
__syncthreads();
}
srcPointer = (sckltran->srcbuffer == SCKL_INPUT_BUFFER) ? thisInput : thisOutput;
srcoffset = chunkOffset + (ssize_t) sckltran->srcoffset * sizePerScklChunk;
dstPointer = (sckltran->dstbuffer == SCKL_INPUT_BUFFER) ? thisInput : thisOutput;
dstoffset = chunkOffset + (ssize_t) sckltran->dstoffset * sizePerScklChunk;
switch (sckltran->type) {
case SCKL_SEND:
prims.send(srcPointer + srcoffset, nelem);
break;
case SCKL_RECV:
prims.recv(dstPointer + dstoffset, nelem);
break;
case SCKL_RECV_COPY_SEND:
prims.recvCopySend(dstPointer + dstoffset, nelem);
break;
default:
return;
}
if (tid == sync_tid){
__threadfence();
uint64_t curFlag = COMPUTE_FLAG(workIndex, iter, i);
scklFlags[bid].flag = curFlag;
}
}
}
}
};
template<class FUNC, typename T, int UNROLL>
class SCKLFunctionLL {
public:
__device__ void run(struct ncclWorkElem* args) {
struct ncclDevComm* comm = args->comm;
struct scklAlgorithm* scklAlgo = &comm->scklAlgo;
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int sync_tid = args->nThreads-1; // last thread is most likely not doing anything and is used for sckl cross-thread synchronization
const int bid = blockIdx.x;
const int scklNBlocks = scklAlgo->nBlocks;
const int rscklbid = bid % scklNBlocks; // bid within a sckl algo
const int scklIndex = bid / scklNBlocks; // which instance of sckl algo
const int nScklInstnaces = gridDim.x / scklAlgo->nBlocks; // number of sckl algo instances
struct scklThreadBlock* scklTB = &scklAlgo->scklTB[rscklbid];
const int channelId = scklIndex * scklAlgo->nChannels + scklTB->channelId;
struct ncclChannel* channel = comm->channels+channelId;
const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
const int nchunksPerLoopPerRank = scklAlgo->nchunksPerLoop/nranks;
const ssize_t loopSize = (ssize_t)chunkSize*nScklInstnaces;
const ssize_t size = args->coll.count;
const ssize_t sizePerScklChunk = size/nchunksPerLoopPerRank;
// sckl flags all start out at 0. the work index is folded into the flag so that different work items use distinct synchronization values
// this still needs more work: when the work queue wraps around, the flag might have been left at an undesired value. will be fixed in subsequent versions.
const int workIndex = args->index+1;
volatile struct scklFlag* scklFlags = comm->scklFlags;
// Compute pointers
T * thisInput = (T*)args->sendbuff;
T * thisOutput = (T*)args->recvbuff;
int recvPeer = scklTB->recvpeer;
int sendPeer = scklTB->sendpeer;
ncclLLPrimitives<T, FUNC, 1, 1> prims(tid, nthreads, &recvPeer, &sendPeer, stepLines, channel, comm);
for (ssize_t gridOffset = 0, iter = 0; gridOffset < sizePerScklChunk; gridOffset += loopSize, iter++) {
ssize_t chunkOffset = gridOffset + scklIndex*chunkSize;
ssize_t srcoffset, dstoffset;
T* srcPointer, * dstPointer;
int nelem = min(chunkSize, sizePerScklChunk-chunkOffset);
for (int i = 0; i < scklTB->nsteps; i++){
struct scklTransfer* sckltran = &scklTB->transfers[i];
if (sckltran->type == SCKL_NO_OP) continue;
// first wait if there is a dependence
int8_t dependentBid = sckltran->dependentBid + scklIndex * scklNBlocks;
int8_t dependentStep = sckltran->dependentStep;
if (sckltran->dependentBid >= 0){
if (tid == sync_tid){
uint64_t goalFlag = COMPUTE_FLAG(workIndex, iter, dependentStep);
while ((scklFlags + dependentBid)->flag < goalFlag){};
}
__syncthreads();
}
srcPointer = (sckltran->srcbuffer == SCKL_INPUT_BUFFER) ? thisInput : thisOutput;
srcoffset = chunkOffset + (ssize_t) sckltran->srcoffset * sizePerScklChunk;
dstPointer = (sckltran->dstbuffer == SCKL_INPUT_BUFFER) ? thisInput : thisOutput;
dstoffset = chunkOffset + (ssize_t) sckltran->dstoffset * sizePerScklChunk;
switch (sckltran->type) {
case SCKL_SEND:
prims.send(srcPointer + srcoffset, nelem);
break;
case SCKL_RECV:
prims.recv(dstPointer + dstoffset, nelem);
break;
case SCKL_RECV_COPY_SEND:
prims.recvCopySend(dstPointer + dstoffset, nelem);
break;
default:
return;
}
if (tid == sync_tid){
__threadfence();
uint64_t curFlag = COMPUTE_FLAG(workIndex, iter, i);
scklFlags[bid].flag = curFlag;
}
}
}
}
};
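The wait/publish pair inside these interpreters (spin on a dependent block's flag, __syncthreads, then __threadfence and a flag store) is a generic cross-thread-block ordering pattern. A reduced CUDA sketch of just that mechanism, with hypothetical names:

// Reduced sketch of the cross-block ordering used above; 'flags' is a
// hypothetical global array, one slot per block, zero-initialized by the host.
#include <cstdint>
__global__ void orderedBlocks(volatile uint64_t* flags) {
  const int bid = blockIdx.x;
  const int syncTid = blockDim.x - 1;   // one designated thread waits and publishes
  if (bid > 0) {                        // block bid depends on block bid-1
    if (threadIdx.x == syncTid) {
      while (flags[bid - 1] < 1) {}     // spin until the dependency posts its flag
    }
    __syncthreads();                    // release the whole block once the flag is seen
  }
  // ... this block's work for the step would go here ...
  if (threadIdx.x == syncTid) {
    __threadfence();                    // make this block's writes visible device-wide
    flags[bid] = 1;                     // publish progress, as scklFlags[bid].flag does above
  }
}

As in the interpreters, this only works if all participating thread blocks are co-resident on the GPU; a block spinning on a flag owned by a block that never gets scheduled would hang.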

View file

@@ -121,7 +121,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Only sckl alltoall functions with the simple protocol are implemented
// A SCKL algorithm is dynamic, and its busBw/latency can only be determined from the input XML algorithm. An analysis will be added later.
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if ((coll == ncclFuncAllToAll || coll == ncclFuncAllGather) && a == NCCL_ALGO_SCKL && p == NCCL_PROTO_SIMPLE){
if ((coll == ncclFuncAllToAll || coll == ncclFuncAllGather) && a == NCCL_ALGO_SCKL){
// Setting the bandwidth and latency values to 1.0 (an arbitrary value) so that this algorithm doesn't get skipped by ncclTopoGetAlgoTime
comm->bandwidths[coll][a][p] = 1.0;
comm->latencies[coll][a][p] = 1.0;
@@ -215,7 +215,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
// Only disable algo for Allreduce and Allgather since others only have one
if (c == ncclFuncAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
if ((c == ncclFuncAllReduce || c == ncclFuncAllGather) && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
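For reference on why the placeholder values matter: NCCL picks among algorithm/protocol pairs with a cost estimate of roughly latency plus size over bandwidth, and a bandwidth of 0 marks a pair as disabled. The sketch below is an assumed approximation of that model, not ncclTopoGetAlgoTime's exact formula:

#include <stddef.h>
// Assumed shape of the selection cost; bandwidth == 0 is the "disabled" sentinel
// used by the surrounding code, so any nonzero placeholder keeps SCKL selectable.
float algoTimeUs(float latencyUs, float bandwidthGBps, size_t bytes) {
  if (bandwidthGBps == 0.0f) return -1.0f;              // disabled: never selected
  return latencyUs + bytes / (1000.0f * bandwidthGBps); // microseconds for GB/s inputs
}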

View file

@@ -74,8 +74,8 @@ DECL_ALL
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define ALLTOALL_SLICESTEPS (NCCL_STEPS/4)
#define ALLTOALL_CHUNKSTEPS (NCCL_STEPS/2)
#define SCKL_SLICESTEPS (NCCL_STEPS/4)
#define SCKL_CHUNKSTEPS (NCCL_STEPS/2)
#define SENDRECV_SLICEFACTOR 4
#endif
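The renamed defines keep ALLTOALL's old values. Under the assumption that NCCL_STEPS is 8 (its usual value, defined elsewhere in the tree), a chunk is two slices and two chunks fit in the step pipeline; a standalone compile-time check:

// Standalone check; NCCL_STEPS == 8 is assumed here, not taken from this diff.
#define NCCL_STEPS 8
#define SCKL_SLICESTEPS (NCCL_STEPS/4)   // == 2
#define SCKL_CHUNKSTEPS (NCCL_STEPS/2)   // == 4
static_assert(SCKL_CHUNKSTEPS / SCKL_SLICESTEPS == 2, "a chunk is two slices");
static_assert(NCCL_STEPS / SCKL_CHUNKSTEPS == 2, "two chunks fit in the pipeline");
int main() {}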

View file

@@ -118,7 +118,7 @@ struct ncclRing {
};
#define SCKL_MAX_NUM_STEPS 16
#define SCKL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL 128
#define SCKL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL 8
#define SCKL_INPUT_BUFFER 0
#define SCKL_OUTPUT_BUFFER 1
@@ -199,7 +199,7 @@ struct ncclWorkElem {
uint16_t funcIndex;
uint16_t index;
// in SCKL algorithms, the ncclWorkElem.active element from workFifo is replicated for all other thread blocks
uint16_t active[SCKL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL];
uint8_t active[SCKL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL];
uint8_t isScklAlgorithm; // right now, 0 indicates not a sckl algorithm and 1 indicates it is. In future versions, this will be the index into arrays of scklAlgorithms.
uint8_t nActives; // if it is a sckl algorithm, it must be set to the number of thread blocks associated with the channel. if not a sckl algorithm, it is 1.
@@ -221,13 +221,13 @@ struct ncclWorkElem {
int32_t delta;
uint16_t nThreads;
} p2p;
uint64_t align[28];
uint64_t align[3];
};
};
struct ncclWork {
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
};
static_assert(sizeof(struct ncclWorkElem) == (0x80*sizeof(int)), "ncclWorkElem must have a pow2 size");
static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
struct ncclChannel {
union {
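A back-of-the-envelope check of the tightened assertion: shrinking active from uint16_t[128] (256 bytes) to uint8_t[8] (8 bytes) saves 248 bytes, shrinking align from uint64_t[28] to uint64_t[3] saves 200 bytes, and together they account exactly for the drop from 0x80*sizeof(int) = 512 bytes to 0x10*sizeof(int) = 64 bytes. The arithmetic only, since this hunk elides the rest of the struct:

#include <cstdint>
// Size deltas implied by this hunk alone; fields not shown are assumed unchanged,
// and sizeof(int) == 4 is assumed, as the original assertion already implies.
static_assert(128 * sizeof(uint16_t) - 8 * sizeof(uint8_t) == 248, "active[]: 256 B -> 8 B");
static_assert((28 - 3) * sizeof(uint64_t) == 200, "align[]: 224 B -> 24 B");
static_assert(248 + 200 == 0x80 * sizeof(int) - 0x10 * sizeof(int), "512 B -> 64 B exactly");
int main() {}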