зеркало из https://github.com/microsoft/msccl.git
Added guards for corner cases, including a bounds check against SCKL_MAX_NUM_CONN before recording each channel's send/recv peers
This commit is contained in:
Родитель
cae8c88a87
Коммит
ee8e4c9d12
|
@ -24,7 +24,7 @@ class ncclFunction<ncclFuncAllToAll, ALGO, PROTO, FUNC, T, UNROLL> {
|
|||
const int nranks = comm->nRanks;
|
||||
const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
|
||||
const ssize_t size = args->coll.count;
|
||||
|
||||
return;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->sendbuff;
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
|
|
@ -649,9 +649,17 @@ ncclResult_t scklGetTopoFromXMLAndSetChannels(struct ncclComm* comm) {
|
|||
// SCKL generates the same scklGraph for all channels for now. This will change in the future
|
||||
for (int c=0; c<comm->nChannels; c++){
|
||||
if (isRecv) {
|
||||
comm->channels[c].sGraph.recv[comm->channels[c].sGraph.nRecvPeers++] = peerId;
|
||||
if (comm->channels[c].sGraph.nRecvPeers < SCKL_MAX_NUM_CONN){
|
||||
comm->channels[c].sGraph.recv[comm->channels[c].sGraph.nRecvPeers++] = peerId;
|
||||
} else {
|
||||
WARN("Too many recv connections for device %d channel %d -- connection to %d is ignored. This may cause deadlock in initialization.", rank, c, peerId);
|
||||
}
|
||||
} else if (isSend){
|
||||
comm->channels[c].sGraph.send[comm->channels[c].sGraph.nSendPeers++] = peerId;
|
||||
if (comm->channels[c].sGraph.nSendPeers < SCKL_MAX_NUM_CONN){
|
||||
comm->channels[c].sGraph.send[comm->channels[c].sGraph.nSendPeers++] = peerId;
|
||||
} else {
|
||||
WARN("Too many recv connections for device %d channel %d -- connection to %d is ignored. This may cause deadlock in initialization.", rank, c, peerId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче