This commit is contained in:
Saeed Maleki 2021-03-24 23:12:32 +00:00
Parent 0d51c43749
Commit b63029c21b
6 changed files: 76 additions and 34 deletions

View file

@@ -29,6 +29,7 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
// SCKL active blocks allocation
NCCLCHECK(ncclCudaHostCalloc(&channel->scklActiveThreadBlocks, (SCKL_MAX_NUM_THREAD_BLOCKS-1)*NCCL_MAX_OPS));
NCCLCHECK(ncclCudaHostCalloc(&channel->scklNumBlocksPerChannel, 1));
return ncclSuccess;
}
@@ -38,6 +39,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
NCCLCHECK(ncclCudaHostFree(channel->scklActiveThreadBlocks));
NCCLCHECK(ncclCudaHostFree(channel->scklNumBlocksPerChannel));
// Free Ring index to rank tables
free(channel->ring.userRanks);
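
Aside: ncclCudaHostCalloc hands back zeroed, pinned host memory that the device can dereference directly, which is what lets the kernels below read scklNumBlocksPerChannel without an explicit copy. An illustrative stand-in for it (not NCCL's actual implementation; helper name is hypothetical):

#include <cuda_runtime.h>
#include <cstring>

// Illustrative stand-in for ncclCudaHostCalloc: pinned host memory, zeroed,
// mapped into the device address space (with UVA the host pointer is also
// valid on the device).
template <typename T>
cudaError_t hostMappedCalloc(T** ptr, size_t nelem) {
  cudaError_t err = cudaHostAlloc((void**)ptr, nelem * sizeof(T), cudaHostAllocMapped);
  if (err != cudaSuccess) return err;
  memset(*ptr, 0, nelem * sizeof(T));
  return cudaSuccess;
}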

View file

@@ -14,28 +14,57 @@ class ncclFunction<ncclFuncAllToAll, ALGO, PROTO, FUNC, T, UNROLL> {
__device__ void run(struct ncclWorkElem* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->coll.bid;
const int bid = blockIdx.x;
const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
// struct scklAlgoState* sGraph = &channel->sGraph;
const int scklNumBlocksPerChannel = *comm->channels[0].scklNumBlocksPerChannel;
const int channelId = bid/scklNumBlocksPerChannel;
struct ncclChannel* channel = comm->channels+channelId;
// rbid is this block's index relative to its channel
int rbid = bid % scklNumBlocksPerChannel;
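// e.g. with scklNumBlocksPerChannel == 3: bid 0,1,2 -> channel 0 (rbid 0,1,2),
// bid 3,4,5 -> channel 1 (rbid 0,1,2), and so on.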
struct scklAlgorithm* scklAlgo = &comm->scklAlgo;
struct scklThreadBlock* sckltb = &scklAlgo->scklTB[rbid];
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
const int nranks = comm->nRanks;
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
const ssize_t size = args->coll.count;
const int nChunks = scklAlgo->nChunks;
// assume that size is divisible by nChunks
const ssize_t sizePerChunk = size/nChunks;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->sendbuff;
T * __restrict__ thisOutput = (T*)args->recvbuff;
// ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 3, 3, 1, FUNC>
// prims(tid, nthreads, sGraph->recv, sGraph->send, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
// if (tid == 0 && bid == 0){
// printf("connected to %d %d %d\n", sGraph->send[0], sGraph->send[1], sGraph->send[2]);
// }
// int testSize = min(chunkSize, (int)size/nChannels/nranks);
// prims.directSend(thisInput, 0, testSize);
// prims.directRecv(thisOutput, 0, testSize);
int myRank = channel->ring.devUserRanks[0];
int m1 = -1;
int recvPeer = (sckltb->type == SCKL_RECV) ? sckltb->peer : m1;
int sendPeer = (sckltb->type == SCKL_SEND) ? sckltb->peer : m1;
ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &recvPeer, &sendPeer, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
for (ssize_t gridOffset = 0; gridOffset < sizePerChunk; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(sizePerChunk-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + channelId*realChunkSize;
ssize_t offset;
int nelem = min(realChunkSize, sizePerChunk-chunkOffset);
for (int i = 0; i < sckltb->nsteps; i++){
offset = chunkOffset + sckltb->transfers[i] * sizePerChunk;
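// transfers[i] selects which of the nChunks chunks this step touches;
// chunkOffset then picks this channel's slice within that chunk.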
if (sckltb->type == SCKL_SEND){
if (tid == 0)
printf("SSS sending myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
prims.directSend(thisInput + offset, offset, nelem);
if (tid == 0)
printf("EEE sending myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
} else if (sckltb->type == SCKL_RECV) {
if (tid == 0)
printf("SSS receiving myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
prims.directRecv(thisOutput + offset, offset, nelem);
if (tid == 0)
printf("EEE receiving myRank %d recvPeer = %d sendPeer = %d bid = %d nSteps = %d nelem = %d\n", myRank, recvPeer, sendPeer, bid, sckltb->nsteps, nelem);
}
}
}
}
};
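
A compact sketch of the dispatch the loop above implements; the schedule values here are hypothetical:

// Hypothetical SCKL schedule for one channel (scklNumBlocksPerChannel == 2):
//   rbid 0: { type = SCKL_SEND, peer = 1, nsteps = 2, transfers = {1, 3} }
//   rbid 1: { type = SCKL_RECV, peer = 3, nsteps = 2, transfers = {0, 2} }
// Block rbid 0 pushes chunks 1 and 3 of thisInput to rank 1 via prims.directSend;
// block rbid 1 lands chunks 0 and 2 from rank 3 into thisOutput via prims.directRecv.
// A send-typed block leaves recvPeer at -1 (and vice versa), so each
// ncclPrimitives instance only opens one direction.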

View file

@@ -9,6 +9,7 @@
#include "collectives.h"
#include "devcomm.h"
#include <stdio.h>
#if __CUDA_ARCH__ >= 800
@@ -82,11 +83,12 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
struct ncclDevComm* comm = first.comm;
// in SCKL, if channels have different numbers of thread blocks, this needs to change.
int scklNumBlocksPerChannel = comm->channels[0].scklNumBlocksPerChannel;
int scklNumBlocksPerChannel = *comm->channels[0].scklNumBlocksPerChannel;
struct ncclChannel* channel = comm->channels + (bid / scklNumBlocksPerChannel);
struct ncclWorkElem* w = NULL;
uint16_t index = first.index;
int myRank = channel->ring.devUserRanks[0];
/* To optimize for latency, (only) the first operation is passed as argument.*/
if (bid < scklNumBlocksPerChannel && first.funcIndex != FUNC_INDEX_P2P) w = &first;
while (1) {
@@ -94,6 +96,9 @@ __device__ void ncclKernel(struct ncclWorkElem first) {
w = shmem.localWork.elems;
load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, bid, scklNumBlocksPerChannel);
}
if (tid == 0){
printf("bid = %d index = %d\n", bid, index);
}
if (tid < w->nThreads) {
if (w->funcIndex == FINDEX) {
f.run(w);
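
A minimal sketch of how a block could decide that op `index` is ready under the replication scheme above; the helper name is hypothetical, and the rbid-to-row mapping is inferred from getNextOp's indexing in the next file:

// Hypothetical helper: rbid 0 owns the real work queue; the extra blocks
// (rbid >= 1) poll the flag that getNextOp() replicated for them at
// scklActiveThreadBlocks[(rbid-1)*NCCL_MAX_OPS + index].
__device__ int scklOpActive(struct ncclChannel* channel, int rbid, uint16_t index) {
  if (rbid == 0) return channel->workFifo[index].elems[0].active;
  return channel->scklActiveThreadBlocks[(rbid-1)*NCCL_MAX_OPS + index];
}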

View file

@@ -104,7 +104,7 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
e->active = 1;
// SCKL replicates the active flag for the channel's other thread blocks
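// layout: scklActiveThreadBlocks holds (SCKL_MAX_NUM_THREAD_BLOCKS-1) rows of
// NCCL_MAX_OPS flags; row i carries the copy of `active` for extra block i+1.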
for (int i = 0; i < channel->scklNumBlocksPerChannel-1; i++){
for (int i = 0; i < *channel->scklNumBlocksPerChannel-1; i++){
channel->scklActiveThreadBlocks[i*NCCL_MAX_OPS + opIndex] = 1;
}
e->index = opIndex;
@@ -119,9 +119,6 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
for (int c=0; c<comm->p2pnChannels; c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
}
// SCKL: for now we assume scklNumBlocksPerChannel is the same for every channel.
// multiply the number of thread blocks by scklNumBlocksPerChannel
params->gridDim.x *= comm->channels[0].scklNumBlocksPerChannel;
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<params->gridDim.x; c++) {
@@ -136,11 +133,16 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
}
int channelTailIndex = ((channel->workFifoTail-1)%NCCL_MAX_OPS);
channel->workFifo[channelTailIndex].elems[0].active = 2;
for (int i = 0; i < channel->scklNumBlocksPerChannel-1; i++){
for (int i = 0; i < *channel->scklNumBlocksPerChannel-1; i++){
channel->scklActiveThreadBlocks[i*NCCL_MAX_OPS + channelTailIndex] = 2;
}
}
// This is the first point where SCKL decouples bids from channels
// SCKL: for now we assume scklNumBlocksPerChannel is the same for every channel.
// multiply the number of thread blocks by scklNumBlocksPerChannel
params->gridDim.x *= *comm->channels[0].scklNumBlocksPerChannel;
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclChannel* c0 = comm->channels;
@@ -485,7 +487,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
channel->scklNumBlocksPerChannel = (info->algorithm == NCCL_ALGO_SCKL) ? info->comm->scklAlgo.nBlocks : 1;
*channel->scklNumBlocksPerChannel = (info->algorithm == NCCL_ALGO_SCKL) ? info->comm->scklAlgo.nBlocks : 1;
// Proxy
proxyArgs.channel = channel;
// Adjust pattern for CollNet based on channel index
@@ -625,7 +627,7 @@ ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
// sckl does not generate p2p kernels.
channel->scklNumBlocksPerChannel = 1;
*channel->scklNumBlocksPerChannel = 1;
return ncclSuccess;
}
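
For intuition, the launch arithmetic above with illustrative numbers:

// Illustrative numbers: 4 channels with work and scklAlgo.nBlocks == 3 give
//   params->gridDim.x = 4 * 3 = 12 thread blocks;
// in the kernel, bid 0..11 map back via channelId = bid / 3 and rbid = bid % 3.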

View file

@@ -117,21 +117,24 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Skip SCKL algorithms here since SCKL tunes the algorithm in the synthesizer.
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
if (a == NCCL_ALGO_SCKL){
// Only the SCKL alltoall function is implemented
if (coll == ncclFuncAllToAll){
// SCKL algorithm is dynamic and busBw/latency can only be determined by the input XML algorithm. An analysis will be added later.
// Setting the bandwidth and latency values to 1.0 (some arbitrary value) so that they don't get skipped by ncclTopoGetAlgoTime
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (coll == ncclFuncAllToAll || a == NCCL_ALGO_SCKL) {
// Only the SCKL alltoall function with the Simple protocol is implemented
// SCKL algorithm is dynamic and busBw/latency can only be determined by the input XML algorithm. An analysis will be added later.
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (coll == ncclFuncAllToAll && a == NCCL_ALGO_SCKL && p == NCCL_PROTO_SIMPLE){
// Setting the bandwidth and latency values to 1.0 (some arbitrary value) so that they don't get skipped by ncclTopoGetAlgoTime
comm->bandwidths[coll][a][p] = 1.0;
comm->latencies[coll][a][p] = 1.0;
} else {
comm->bandwidths[coll][a][p] = 0.0; // This makes sure that SCKL is not selected in any other scenario
comm->latencies[coll][a][p] = 0.0;
}
} else {
continue;
}
continue;
}
if (coll != ncclFuncAllReduce && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float speed = nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float busBw = graphs[a]->nChannels * speed;
@@ -197,7 +200,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (comm->collNetSupport == 0) {
algoEnable[NCCL_ALGO_COLLNET] = 0;
// If user has hard set NCCL_ALGO=COLLNET, ignore it
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0) {
if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && algoEnable[NCCL_ALGO_SCKL] == 0) {
algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = algoEnable[NCCL_ALGO_SCKL] = 1;
if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
}
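
For context on the 1.0/0.0 settings above: NCCL's tuner computes a time estimate per (coll, algo, proto) entry and treats zero bandwidth as "unavailable". A paraphrased sketch of that rule (not the literal NCCL source):

// Paraphrased selection rule: zero bandwidth marks an entry unusable, so with
// the settings above SCKL can only win for alltoall with the Simple protocol.
static float algoTime(float latencyUs, float bwGBps, size_t bytes) {
  if (bwGBps == 0.0f) return -1.0f;   // negative time = skip this combination
  return latencyUs + bytes / bwGBps;  // simple latency + size/bandwidth model
}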

View file

@@ -18,8 +18,8 @@ extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 4 // Tree/Ring/SCKL/CollNet
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET 2
#define NCCL_ALGO_SCKL 3
#define NCCL_ALGO_SCKL 2
#define NCCL_ALGO_COLLNET 3
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -222,7 +222,8 @@ struct ncclChannel {
uint64_t workFifoTail; // Only used by CPU
// in SCKL algorithms, the ncclWorkElem.active element from workFifo is replicated for all other thread blocks
uint16_t* scklActiveThreadBlocks;
int scklNumBlocksPerChannel;
// this will be allocated on the host and mapped into the device address space
int* scklNumBlocksPerChannel;
};
int data[0x80];
};
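
Note on the int -> int* change: the count now lives in pinned, device-mapped host memory (allocated in initChannel above), so the host can set it at enqueue time and every launch reads the current value without an explicit copy. A minimal usage sketch using the fields from this commit:

// Host side, before launch (see ncclSaveKernel above):
//   *channel->scklNumBlocksPerChannel = info->comm->scklAlgo.nBlocks;
// Device side, inside the kernel (see ncclKernel above):
//   int n = *comm->channels[0].scklNumBlocksPerChannel;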