This commit is contained in:
Saeed Maleki 2021-03-24 23:12:32 +00:00
Parent 0d51c43749
Commit b63029c21b
6 changed files: 76 additions and 34 deletions

View file

@@ -29,6 +29,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
// SCKL active blocks allocation
NCCLCHECK(ncclCudaHostCalloc(&channel->scklActiveThreadBlocks, (SCKL_MAX_NUM_THREAD_BLOCKS-1)*NCCL_MAX_OPS));
NCCLCHECK(ncclCudaHostCalloc(&channel->scklNumBlocksPerChannel, 1));
return ncclSuccess;
}
@@ -38,6 +39,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
NCCLCHECK(ncclCudaHostFree(channel->scklActiveThreadBlocks));
NCCLCHECK(ncclCudaHostFree(channel->scklNumBlocksPerChannel));
// Free Ring index to rank tables
free(channel->ring.userRanks);
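
Aside: ncclCudaHostCalloc hands back zeroed, pinned host memory that the device can dereference directly, which is what lets the kernels below read scklNumBlocksPerChannel without an explicit copy. An illustrative stand-in for it (not NCCL's actual implementation; helper name is hypothetical):

#include <cuda_runtime.h>
#include <cstring>

// Illustrative stand-in for ncclCudaHostCalloc: pinned host memory, zeroed,
// mapped into the device address space (with UVA the host pointer is also
// valid on the device).
template <typename T>
cudaError_t hostMappedCalloc(T** ptr, size_t nelem) {
  cudaError_t err = cudaHostAlloc((void**)ptr, nelem * sizeof(T), cudaHostAllocMapped);
  if (err != cudaSuccess) return err;
  memset(*ptr, 0, nelem * sizeof(T));
  return cudaSuccess;
}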

View file

@@ -14,28 +14,57 @@ class ncclFunction<ncclFuncAllToAll, ALGO, PROTO, FUNC, T, UNROLL> {
__device__ void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->coll.bid;
const int bid = blockIdx.x;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
// struct scklAlgoState* sGraph = &channel->sGraph;
const int scklNumBlocksPerChannel = *comm->channels[0].scklNumBlocksPerChannel;
const int channelId = bid/scklNumBlocksPerChannel;
struct ncclChannel* channel = comm->channels+channelId;
// rbid is this block's index relative to its channel
int rbid = bid % scklNumBlocksPerChannel;
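// e.g. with scklNumBlocksPerChannel == 3: bid 0,1,2 -> channel 0 (rbid 0,1,2),
// bid 3,4,5 -> channel 1 (rbid 0,1,2), and so on.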
struct scklAlgorithm* scklAlgo = &comm->scklAlgo;
struct scklThreadBlock* sckltb = &scklAlgo->scklTB[rbid];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int nChunks = scklAlgo->nChunks;
// assume that size is divisible by nChunks
const ssize_t sizePerChunk = size/nChunks;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 3, 3, 1, FUNC>
// prims(tid, nthreads, sGraph->recv, sGraph->send, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
// if (tid == 0 && bid == 0){
// printf("connected to %d %d %d\n", sGraph->send[0], sGraph->send[1], sGraph->send[2]);
// }
// int testSize = min(chunkSize, (int)size/nChannels/nranks);
// prims.directSend(thisInput, 0, testSize);
// prims.directRecv(thisOutput, 0, testSize);
int myRank = channel->ring.devUserRanks[0];
int m1 = -1;
int recvPeer = (sckltb->type == SCKL_RECV) ? sckltb->peer : m1;
int sendPeer = (sckltb->type == SCKL_SEND) ? sckltb->peer : m1;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &recvPeer, &sendPeer, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < sizePerChunk; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(sizePerChunk-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + channelId*realChunkSize;
ssize_t offset;
int nelem = min(realChunkSize, sizePerChunk-chunkOffset);
for (int i = 0; i < sckltb->nsteps; i++){
offset = chunkOffset + sckltb->transfers[i] * sizePerChunk;
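// transfers[i] selects which of the nChunks chunks this step touches;
// chunkOffset then picks this channel's slice within that chunk.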
if (sckltb->type == SCKL_SEND){
if (tid == 0)
printf("SSS sending myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
prims.directSend(thisInput + offset, offset, nelem);
if (tid == 0)
printf("EEE sending myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
} else if (sckltb->type == SCKL_RECV) {
if (tid == 0)
printf("SSS receiving myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
prims.directRecv(thisOutput + offset, offset, nelem);
if (tid == 0)
printf("EEE receiving myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
}
}
}
}
};
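
A compact sketch of the dispatch the loop above implements; the schedule values here are hypothetical:

// Hypothetical SCKL schedule for one channel (scklNumBlocksPerChannel == 2):
//   rbid 0: { type = SCKL_SEND, peer = 1, nsteps = 2, transfers = {1, 3} }
//   rbid 1: { type = SCKL_RECV, peer = 3, nsteps = 2, transfers = {0, 2} }
// Block rbid 0 pushes chunks 1 and 3 of thisInput to rank 1 via prims.directSend;
// block rbid 1 lands chunks 0 and 2 from rank 3 into thisOutput via prims.directRecv.
// A send-typed block leaves recvPeer at -1 (and vice versa), so each
// ncclPrimitives instance only opens one direction.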

View file

@@ -9,6 +9,7 @@
#include "collectives.h"
#include "devcomm.h"
#include <stdio.h>
#if __CUDA_ARCH__ >= 800
@@ -82,11 +83,12 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
struct ncclDevComm* comm = first.comm;
// in SCKL, if channels have different numbers of thread blocks, this needs to change.
int scklNumBlocksPerChannel = comm->channels[0].scklNumBlocksPerChannel;
int scklNumBlocksPerChannel = *comm->channels[0].scklNumBlocksPerChannel;
struct ncclChannel* channel = comm->channels + (bid / scklNumBlocksPerChannel);
struct ncclWorkElem* w = NULL;
uint16_t index = first.index;
int myRank = channel->ring.devUserRanks[0];
/* To optimize for latency, (only) the first operation is passed as argument.*/
if (bid < scklNumBlocksPerChannel && first.funcIndex != FUNC_INDEX_P2P) w = &first;
while (1) {
@@ -94,6 +96,9 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
w = shmem.localWork.elems;
load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, bid, scklNumBlocksPerChannel);
}
if (tid == 0){
printf("bid = %d index = %d\n", bid, index);
}
if (tid < w->nThreads) {
if (w->funcIndex == FINDEX) {
f.run(w);
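
A minimal sketch of how a block could decide that op `index` is ready under the replication scheme above; the helper name is hypothetical, and the rbid-to-row mapping is inferred from getNextOp's indexing in the next file:

// Hypothetical helper: rbid 0 owns the real work queue; the extra blocks
// (rbid >= 1) poll the flag that getNextOp() replicated for them at
// scklActiveThreadBlocks[(rbid-1)*NCCL_MAX_OPS + index].
__device__ int scklOpActive(struct ncclChannel* channel, int rbid, uint16_t index) {
  if (rbid == 0) return channel->workFifo[index].elems[0].active;
  return channel->scklActiveThreadBlocks[(rbid-1)*NCCL_MAX_OPS + index];
}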

View file

@@ -104,7 +104,7 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
e->active = 1;
// SCKL replicates the active flag for the channel's other thread blocks
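// layout: scklActiveThreadBlocks holds (SCKL_MAX_NUM_THREAD_BLOCKS-1) rows of
// NCCL_MAX_OPS flags; row i carries the copy of `active` for extra block i+1.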
for (int i = 0; i < channel->scklNumBlocksPerChannel-1; i++){
for (int i = 0; i < *channel->scklNumBlocksPerChannel-1; i++){
channel->scklActiveThreadBlocks[i*NCCL_MAX_OPS + opIndex] = 1;
}
e->index = opIndex;
@@ -119,9 +119,6 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
for (int c=0; c<comm->p2pnChannels; c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
}
// SCKL: for now we assume scklNumBlocksPerChannel is the same for every channel.
// multiply the number of thread blocks by scklNumBlocksPerChannel
params->gridDim.x *= comm->channels[0].scklNumBlocksPerChannel;
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<params->gridDim.x; c++) {
@@ -136,11 +133,16 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
}
int channelTailIndex = ((channel->workFifoTail-1)%NCCL_MAX_OPS);
channel->workFifo[channelTailIndex].elems[0].active = 2;
for (int i = 0; i < channel->scklNumBlocksPerChannel-1; i++){
for (int i = 0; i < *channel->scklNumBlocksPerChannel-1; i++){
channel->scklActiveThreadBlocks[i*NCCL_MAX_OPS + channelTailIndex] = 2;
}
}
// This is the first point where SCKL decouples bids from channels
// SCKL: for now we assume scklNumBlocksPerChannel is the same for every channel.
// multiply the number of thread blocks by scklNumBlocksPerChannel
params->gridDim.x *= *comm->channels[0].scklNumBlocksPerChannel;
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclChannel* c0 = comm->channels;
@@ -485,7 +487,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
channel->scklNumBlocksPerChannel = (info->algorithm == NCCL_ALGO_SCKL) ? info->comm->scklAlgo.nBlocks : 1;
*channel->scklNumBlocksPerChannel = (info->algorithm == NCCL_ALGO_SCKL) ? info->comm->scklAlgo.nBlocks : 1;
// Proxy
proxyArgs.channel = channel;
// Adjust pattern for CollNet based on channel index
@@ -625,7 +627,7 @@ ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
// sckl does not generate p2p kernels.
channel->scklNumBlocksPerChannel = 1;
*channel->scklNumBlocksPerChannel = 1;
return ncclSuccess;
}
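
For intuition, the launch arithmetic above with illustrative numbers:

// Illustrative numbers: 4 channels with work and scklAlgo.nBlocks == 3 give
//   params->gridDim.x = 4 * 3 = 12 thread blocks;
// in the kernel, bid 0..11 map back via channelId = bid / 3 and rbid = bid % 3.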

View file

@@ -117,21 +117,24 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Skip SCKL algorithms here since SCKL tunes the algorithm in the synthesizer.
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
if (a == NCCL_ALGO_SCKL){
// Only the SCKL alltoall function is implemented
if (coll == ncclFuncAllToAll){
// SCKL algorithm is dynamic and busBw/latency can only be determined by the input XML algorithm. An analysis will be added later.
// Setting the bandwidth and latency values to 1.0 (some arbitrary value) so that they don't get skipped by ncclTopoGetAlgoTime
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (coll == ncclFuncAllToAll || a == NCCL_ALGO_SCKL) {
// Only the SCKL alltoall function with the Simple protocol is implemented
// SCKL algorithm is dynamic and busBw/latency can only be determined by the input XML algorithm. An analysis will be added later.
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (coll == ncclFuncAllToAll && a == NCCL_ALGO_SCKL && p == NCCL_PROTO_SIMPLE){
// Setting the bandwidth and latency values to 1.0 (some arbitrary value) so that they don't get skipped by ncclTopoGetAlgoTime
comm->bandwidths[coll][a][p] = 1.0;
comm->latencies[coll][a][p] = 1.0;
} else {
comm->bandwidths[coll][a][p] = 0.0; // This makes sure that SCKL is not selected in any other scenario
comm->latencies[coll][a][p] = 0.0;
}
} else {
continue;
}
continue;
}
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float busBw = graphs[a]->nChannels * speed;
@@ -197,7 +200,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && algoEnable[NCCL_ALGO_SCKL] == 0) {
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = algoEnable[NCCL_ALGO_SCKL] = 1;
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
}
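
For context on the 1.0/0.0 settings above: NCCL's tuner computes a time estimate per (coll, algo, proto) entry and treats zero bandwidth as "unavailable". A paraphrased sketch of that rule (not the literal NCCL source):

// Paraphrased selection rule: zero bandwidth marks an entry unusable, so with
// the settings above SCKL can only win for alltoall with the Simple protocol.
static float algoTime(float latencyUs, float bwGBps, size_t bytes) {
  if (bwGBps == 0.0f) return -1.0f;   // negative time = skip this combination
  return latencyUs + bytes / bwGBps;  // simple latency + size/bandwidth model
}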

View file

@@ -18,8 +18,8 @@ extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/SCKL/CollNet
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET 2
#define NCCL_ALGO_SCKL 3
#define NCCL_ALGO_SCKL 2
#define NCCL_ALGO_COLLNET 3
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -222,7 +222,8 @@ struct ncclChannel {
uint64_t workFifoTail; // Only used by CPU
// in SCKL algorithms, the ncclWorkElem.active element from workFifo is replicated for all other thread blocks
uint16_t* scklActiveThreadBlocks;
int scklNumBlocksPerChannel;
// this will be allocated on the host and mapped into the device address space
int* scklNumBlocksPerChannel;
};
int data[0x80];
};
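
Note on the int -> int* change: the count now lives in pinned, device-mapped host memory (allocated in initChannel above), so the host can set it at enqueue time and every launch reads the current value without an explicit copy. A minimal usage sketch using the fields from this commit:

// Host side, before launch (see ncclSaveKernel above):
//   *channel->scklNumBlocksPerChannel = info->comm->scklAlgo.nBlocks;
// Device side, inside the kernel (see ncclKernel above):
//   int n = *comm->channels[0].scklNumBlocksPerChannel;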