зеркало из https://github.com/microsoft/msccl.git
deadlocks
This commit is contained in:
Родитель
0d51c43749
Коммит
b63029c21b
|
@ -29,6 +29,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
|||
|
||||
// SCKL active blocks allocation
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->scklActiveThreadBlocks, (SCKL_MAX_NUM_THREAD_BLOCKS-1)*NCCL_MAX_OPS));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->scklNumBlocksPerChannel, 1));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
@ -38,6 +39,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
|||
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
|
||||
|
||||
NCCLCHECK(ncclCudaHostFree(channel->scklActiveThreadBlocks));
|
||||
NCCLCHECK(ncclCudaHostFree(channel->scklNumBlocksPerChannel));
|
||||
|
||||
// Free Ring index to rank tables
|
||||
free(channel->ring.userRanks);
|
||||
|
|
|
@ -14,28 +14,57 @@ class ncclFunction<ncclFuncAllToAll, ALGO, PROTO, FUNC, T, UNROLL> {
|
|||
__device__ void run(struct ncclWorkElem* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads-WARP_SIZE;
|
||||
const int bid = args->coll.bid;
|
||||
const int bid = blockIdx.x;
|
||||
const int nChannels = args->coll.nChannels;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
// struct scklAlgoState* sGraph = &channel->sGraph;
|
||||
const int scklNumBlocksPerChannel = *comm->channels[0].scklNumBlocksPerChannel;
|
||||
const int channelId = bid/scklNumBlocksPerChannel;
|
||||
struct ncclChannel* channel = comm->channels+channelId;
|
||||
// relative bid to a channel
|
||||
int rbid = bid % scklNumBlocksPerChannel;
|
||||
struct scklAlgorithm* scklAlgo = &comm->scklAlgo;
|
||||
struct scklThreadBlock* sckltb = &scklAlgo->scklTB[rbid];
|
||||
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
|
||||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
const int nChunks = scklAlgo->nChunks;
|
||||
// assume that size is divisible by nchunks
|
||||
const ssize_t sizePerChunk = size/nChunks;
|
||||
// Compute pointers
|
||||
// return;
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
// ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 3, 3, 1, FUNC>
|
||||
// prims(tid, nthreads, sGraph->recv, sGraph->send, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
// if (tid == 0 && bid == 0){
|
||||
// printf("connected to %d %d %d\n", sGraph->send[0], sGraph->send[1], sGraph->send[2]);
|
||||
// }
|
||||
// int testSize = min(chunkSize, (int)size/nChannels/nranks);
|
||||
// prims.directSend(thisInput, 0, testSize);
|
||||
// prims.directRecv(thisOutput, 0, testSize);
|
||||
return;
|
||||
int myRank = channel->ring.devUserRanks[0];
|
||||
int m1 = -1;
|
||||
int recvPeer = (sckltb->type == SCKL_RECV) ? sckltb->peer : m1;
|
||||
int sendPeer = (sckltb->type == SCKL_SEND) ? sckltb->peer : m1;
|
||||
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &recvPeer, &sendPeer, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
|
||||
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < sizePerChunk; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(sizePerChunk-gridOffset,nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + channelId*realChunkSize;
|
||||
ssize_t offset;
|
||||
int nelem = min(realChunkSize, sizePerChunk-chunkOffset);
|
||||
for (int i = 0; i < sckltb->nsteps; i++){
|
||||
offset = chunkOffset + sckltb->transfers[i] * sizePerChunk;
|
||||
if (sckltb->type == SCKL_SEND){
|
||||
if (tid == 0)
|
||||
printf("SSS sending myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
|
||||
prims.directSend(thisInput + offset, offset, nelem);
|
||||
if (tid == 0)
|
||||
printf("EEE sending myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
|
||||
} else if (sckltb->type == SCKL_SEND) {
|
||||
if (tid == 0)
|
||||
printf("SSS receiving myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
|
||||
prims.directRecv(thisOutput + offset, offset, nelem);
|
||||
if (tid == 0)
|
||||
printf("EEE receiving myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include "collectives.h"
|
||||
#include "devcomm.h"
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
|
@ -82,11 +83,12 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
|
|||
|
||||
struct ncclDevComm* comm = first.comm;
|
||||
// in SCKL, if there are different number of threadblocks per channel, this needs to change.
|
||||
int scklNumBlocksPerChannel = comm->channels[0].scklNumBlocksPerChannel;
|
||||
int scklNumBlocksPerChannel = *comm->channels[0].scklNumBlocksPerChannel;
|
||||
struct ncclChannel* channel = comm->channels + (bid / scklNumBlocksPerChannel);
|
||||
struct ncclWorkElem* w = NULL;
|
||||
uint16_t index = first.index;
|
||||
|
||||
int myRank = channel->ring.devUserRanks[0];
|
||||
/* To optimize for latency, (only) the first operation is passed as argument.*/
|
||||
if (bid < scklNumBlocksPerChannel && first.funcIndex != FUNC_INDEX_P2P) w = &first;
|
||||
while (1) {
|
||||
|
@ -94,6 +96,9 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
|
|||
w = shmem.localWork.elems;
|
||||
load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, bid, scklNumBlocksPerChannel);
|
||||
}
|
||||
if (tid == 0){
|
||||
printf("bid = %d index = %d\n", bid, index);
|
||||
}
|
||||
if (tid < w->nThreads) {
|
||||
if (w->funcIndex == FINDEX) {
|
||||
f.run(w);
|
||||
|
|
|
@ -104,7 +104,7 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
|
|||
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
|
||||
e->active = 1;
|
||||
// SCKL replicates active for other thread blocks
|
||||
for (int i = 0; i < channel->scklNumBlocksPerChannel-1; i++){
|
||||
for (int i = 0; i < *channel->scklNumBlocksPerChannel-1; i++){
|
||||
channel->scklActiveThreadBlocks[i*NCCL_MAX_OPS + opIndex] = 1;
|
||||
}
|
||||
e->index = opIndex;
|
||||
|
@ -119,9 +119,6 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
|
|||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
||||
}
|
||||
// SCKL for now we are assuming scklNumBlocksPerChannel is the same for each channel.
|
||||
// multiply the number of threadblocks by scklNumBlocksPerChannel
|
||||
params->gridDim.x *= comm->channels[0].scklNumBlocksPerChannel;
|
||||
|
||||
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
|
||||
for (int c=0; c<params->gridDim.x; c++) {
|
||||
|
@ -136,11 +133,16 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
|
|||
}
|
||||
int channelTailIndex = ((channel->workFifoTail-1)%NCCL_MAX_OPS);
|
||||
channel->workFifo[channelTailIndex].elems[0].active = 2;
|
||||
for (int i = 0; i < channel->scklNumBlocksPerChannel-1; i++){
|
||||
for (int i = 0; i < *channel->scklNumBlocksPerChannel-1; i++){
|
||||
channel->scklActiveThreadBlocks[i*NCCL_MAX_OPS + channelTailIndex] = 2;
|
||||
}
|
||||
}
|
||||
|
||||
// This is the first time SCKL disassociates bids and channels
|
||||
// SCKL for now we are assuming scklNumBlocksPerChannel is the same for each channel.
|
||||
// multiply the number of threadblocks by scklNumBlocksPerChannel
|
||||
params->gridDim.x *= *comm->channels[0].scklNumBlocksPerChannel;
|
||||
|
||||
// Find the first operation, choose the kernel accordingly and pass it
|
||||
// as the first argument.
|
||||
struct ncclChannel* c0 = comm->channels;
|
||||
|
@ -485,7 +487,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
|
|||
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
|
||||
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
|
||||
struct ncclChannel* channel = info->comm->channels+channelId;
|
||||
channel->scklNumBlocksPerChannel = (info->algorithm == NCCL_ALGO_SCKL) ? info->comm->scklAlgo.nBlocks : 1;
|
||||
*channel->scklNumBlocksPerChannel = (info->algorithm == NCCL_ALGO_SCKL) ? info->comm->scklAlgo.nBlocks : 1;
|
||||
// Proxy
|
||||
proxyArgs.channel = channel;
|
||||
// Adjust pattern for CollNet based on channel index
|
||||
|
@ -625,7 +627,7 @@ ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
|
|||
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
|
||||
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
|
||||
// sckl does not generate p2p kernels.
|
||||
channel->scklNumBlocksPerChannel = 1;
|
||||
*channel->scklNumBlocksPerChannel = 1;
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
|
|
@ -117,21 +117,24 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||
// Skipping on SCKL algorithms here since SCKL tune the algorithm in the synthesizer.
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (a == NCCL_ALGO_SCKL){
|
||||
// Only sckl alltoall functions is implemented
|
||||
if (coll == ncclFuncAllToAll){
|
||||
// SCKL algorithm is dynamic and busBw/latency can only be determined by the input XML algorithm. An analysis will be added later.
|
||||
// Setting the bandwidth and latency values to 1.0 (some arbitrary value) so that they don't get skipped by ncclTopoGetAlgoTime
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (coll == ncclFuncAllToAll || a == NCCL_ALGO_SCKL) {
|
||||
// Only sckl alltoall functions with simple protocol is implemented
|
||||
// SCKL algorithm is dynamic and busBw/latency can only be determined by the input XML algorithm. An analysis will be added later.
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (coll == ncclFuncAllToAll && a == NCCL_ALGO_SCKL && p == NCCL_PROTO_SIMPLE){
|
||||
// Setting the bandwidth and latency values to 1.0 (some arbitrary value) so that they don't get skipped by ncclTopoGetAlgoTime
|
||||
comm->bandwidths[coll][a][p] = 1.0;
|
||||
comm->latencies[coll][a][p] = 1.0;
|
||||
} else {
|
||||
comm->bandwidths[coll][a][p] = 0.0; // This will make sure that sckl is not selected for any other scenario
|
||||
comm->latencies[coll][a][p] = 0.0;
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
|
||||
float busBw = graphs[a]->nChannels * speed;
|
||||
|
@ -197,7 +200,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
|||
if (comm->collNetSupport == 0) {
|
||||
algoEnable[NCCL_ALGO_COLLNET] = 0;
|
||||
// If user has hard set NCCL_ALGO=COLLNET, ignore it
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
|
||||
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && algoEnable[NCCL_ALGO_SCKL] == 0) {
|
||||
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = algoEnable[NCCL_ALGO_SCKL] = 1;
|
||||
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
|
||||
}
|
||||
|
|
|
@ -18,8 +18,8 @@ extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
|
|||
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/CollNet
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
#define NCCL_ALGO_COLLNET 2
|
||||
#define NCCL_ALGO_SCKL 3
|
||||
#define NCCL_ALGO_SCKL 2
|
||||
#define NCCL_ALGO_COLLNET 3
|
||||
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
|
@ -222,7 +222,8 @@ struct ncclChannel {
|
|||
uint64_t workFifoTail; // Only used by CPU
|
||||
// in SCKL algorithms, ncclWorkElem.active element from workFifo is replicated for for all other thread blocks
|
||||
uint16_t* scklActiveThreadBlocks;
|
||||
int scklNumBlocksPerChannel;
|
||||
// this will be allocated on the host and mapped on the device
|
||||
int* scklNumBlocksPerChannel;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
|
|
Загрузка…
Ссылка в новой задаче