This commit is contained in:
Saeed Maleki 2021-03-23 06:48:26 +00:00
Родитель 62169be637
Коммит 08d3ea9909
9 изменённых файлов: 113 добавлений и 73 удалений

Просмотреть файл

@ -607,66 +607,72 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
return ncclSuccess;
}
ncclResult_t scklGetTopoFromXMLAndSetChannels(struct ncclComm* comm) {
ncclResult_t scklGetAlgoFromXMLAndSetComm(struct ncclComm* comm) {
char* str = getenv("SCKL_XML_FILE");
if (str){
INFO(NCCL_ENV, "SCKL_XML_FILE set by environment to %s", str);
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
NCCLCHECK(scklTopoGetXmlGraphFromFile(str, xml));
NCCLCHECK(scklGetXmlAlgoFromFile(str, xml));
int rank = comm->rank;
for (int c=0; c<comm->nChannels; c++){
comm->channels[c].sGraph.nRecvPeers = 0;
comm->channels[c].sGraph.nSendPeers = 0;
}
struct scklAglorithm* scklAlgo = &comm->scklAlgo;
// zeroing out all entries.
memset(scklAlgo, 0, sizeof(struct scklAlgorithm));
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
NCCLCHECK(xmlFindTag(xml, "algo", &topNode));
for (int s=0; s<topNode->nSubs; s++) {
struct ncclXmlNode* node = topNode->subs[s];
if (strcmp(node->name, "gpu") == 0){
int id;
NCCLCHECK(xmlGetAttrInt(node, "id", &id));
if (id == rank){
for (int p=0; p<node->nSubs; p++) {
struct ncclXmlNode* typeOfComm = node->subs[p];
if (strcmp(typeOfComm->name, "conn") == 0){
scklAlgo->nBlocks = 0;
for (int t=0; t<node->nSubs; t++) {
struct ncclXmlNode* threadblockNode = node->subs[t];
if (strcmp(threadblockNode->name, "threadblock") == 0){
int rbid, peer;
const char* type;
NCCLCHECK(xmlGetAttrStr(typeOfComm, "type", &type));
NCCLCHECK(xmlGetAttrInt(threadblockNode, "rbid", &rbid));
NCCLCHECK(xmlGetAttrInt(threadblockNode, "peer", &peer));
NCCLCHECK(xmlGetAttrStr(threadblockNode, "type", &type));
if (rbid >= SCKL_MAX_NUM_THREAD_BLOCKS){
WARN("Too many thread blocks are requested. Max thread blocks: %d, requested: %d", SCKL_MAX_NUM_THREAD_BLOCKS, rbid+1);
return ncclInternalError;
}
if (rbid < 0){
WARN("rbid must be positive. rbid: %d", rbid);
return ncclInternalError;
}
scklAlgo->nBlocks = std::max(comm->scklAlgo.nBlocks, rbid+1);
struct scklThreadBlock* sTB = scklAlgo->scklTB[rbid];
sTB->nsteps = 0;
sTB->peer = peer;
if (strcmp(type, "send") == 0){
sTB->type = SCKL_SEND;
} else if (strcmp(type, "recv") == 0) {
sTB->type = SCKL_RECV;
} else {
WARN("type of transfer is not supported: %s", type);
return ncclInternalError;
}
bool isRecv = false;
bool isSend = false;
if (strcmp(type, "recv") == 0){
isRecv = true;
} else if (strcmp(type, "send") == 0){
isSend = true;
}
for (int p=0; p<typeOfComm->nSubs; p++) {
struct ncclXmlNode* peer = typeOfComm->subs[p];
int peerId;
NCCLCHECK(xmlGetAttrInt(peer, "id", &peerId));
// SCKL generates the same scklAlgoState for all channels for now. This will change in the future
for (int c=0; c<comm->nChannels; c++){
if (isRecv) {
if (comm->channels[c].sGraph.nRecvPeers < SCKL_MAX_NUM_CONN){
int index = comm->channels[c].sGraph.nRecvPeers;
comm->channels[c].sGraph.recv[index] = peerId;
// comm->channels[c].sGraph.recv[index].nChunks = 1;
comm->channels[c].sGraph.nRecvPeers++;
} else {
WARN("Too many recv connections for device %d channel %d -- connection to %d is ignored. This may cause deadlock in initialization.", rank, c, peerId);
}
} else if (isSend){
if (comm->channels[c].sGraph.nSendPeers < SCKL_MAX_NUM_CONN){
int index = comm->channels[c].sGraph.nSendPeers;
comm->channels[c].sGraph.send[index] = peerId;
// comm->channels[c].sGraph.send[index].nChunks = 1;
comm->channels[c].sGraph.nSendPeers++;
} else {
WARN("Too many recv connections for device %d channel %d -- connection to %d is ignored. This may cause deadlock in initialization.", rank, c, peerId);
for (int st=0; st<threadblockNode->nSubs; st++) {
struct ncclXmlNode* stepNode = threadblockNode->subs[st];
if (strcmp(stepNode->name, "step") == 0){
int s, chunkId;
NCCLCHECK(xmlGetAttrInt(stepNode, "s", &s));
NCCLCHECK(xmlGetAttrInt(stepNode, "chunkId", &chunkId));
if (s >= SCKL_MAX_NUM_STEPS){
WARN("Too many steps are requested. Max number of steps: %d, requested: %d", SCKL_MAX_NUM_STEPS, s+1);
return ncclInternalError;
}
if (s < 0){
WARN("step must be positive: step %d", s);
return ncclInternalError;
}
sTB->nsteps = std::max(sTB->nsteps, s+1);
sTB->transfers[s] = chunkId;
}
}
}

Просмотреть файл

@ -806,43 +806,47 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
return ncclSuccess;
}
ncclResult_t scklTopoXmlPeerLoad(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
int id;
NCCLCHECK(xmlGetAttrInt(head, "id", &id));
// Load handler for a <step> element of a SCKL algorithm XML file.
//
// Reads the "s" and "chunkId" attributes as integers; the values themselves are
// not used here, the reads only go through NCCLCHECK so that a failing
// xmlGetAttrInt is reported. The remainder of the element is then consumed by
// xmlLoadSub.
//
// file - XML stream being parsed.
// xml  - XML node storage being filled.
// head - the <step> node whose sub-elements (if any) are loaded.
//
// Returns ncclSuccess when both attribute reads and xmlLoadSub succeed.
ncclResult_t scklAlgoXmlStep(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
  int s, chunkId;
  NCCLCHECK(xmlGetAttrInt(head, "s", &s));
  NCCLCHECK(xmlGetAttrInt(head, "chunkId", &chunkId));
  // <step> registers no child handlers. Pass an empty handler table with a
  // matching count of 0; the previous code passed an empty array together with
  // a count of 1, which lets xmlLoadSub index past the end of the array.
  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
  return ncclSuccess;
}
ncclResult_t scklTopoXmlConnLoad(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
// Load handler for a <threadblock> element of a SCKL algorithm XML file.
//
// Reads the "rbid" and "peer" integer attributes and the "type" string
// attribute (the values are not used here; the reads go through NCCLCHECK so a
// failing lookup is reported), then loads the element's children, dispatching
// every <step> child to scklAlgoXmlStep.
//
// file     - XML stream being parsed.
// xmlGraph - XML node storage being filled.
// head     - the <threadblock> node whose sub-elements are loaded.
//
// Returns ncclSuccess when all attribute reads and xmlLoadSub succeed.
ncclResult_t scklAlgoXmlthreadblock(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
  int rbid, peer;
  const char* type;
  // "rbid" must be read into rbid; the previous code passed &id, which is not
  // declared in this function.
  NCCLCHECK(xmlGetAttrInt(head, "rbid", &rbid));
  NCCLCHECK(xmlGetAttrInt(head, "peer", &peer));
  NCCLCHECK(xmlGetAttrStr(head, "type", &type));
  // Single handler table: only <step> children are recognised. (A leftover
  // second declaration of `handlers` for the old "peer" element was removed.)
  struct xmlHandler handlers[] = { { "step", scklAlgoXmlStep } };
  NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1));
  return ncclSuccess;
}
ncclResult_t scklTopoXmlGraphLoad(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
// Load handler for a <gpu> element of a SCKL algorithm XML file.
//
// Reads the "id" integer attribute (value unused here; the read goes through
// NCCLCHECK so a failing lookup is reported), then loads the element's
// children, dispatching every <threadblock> child to scklAlgoXmlthreadblock.
//
// file     - XML stream being parsed.
// xmlGraph - XML node storage being filled.
// head     - the <gpu> node whose sub-elements are loaded.
//
// Returns ncclSuccess when the attribute read and xmlLoadSub succeed.
ncclResult_t scklAlgoXmlGpu(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
  int id;
  NCCLCHECK(xmlGetAttrInt(head, "id", &id));
  // Single handler table: only <threadblock> children are recognised. (A
  // leftover second declaration of `handlers` for the old "conn" element was
  // removed.)
  struct xmlHandler handlers[] = { { "threadblock", scklAlgoXmlthreadblock } };
  NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1));
  return ncclSuccess;
}
ncclResult_t scklTopoXmlSystemLoad(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
struct xmlHandler handlers[] = { { "gpu", scklTopoXmlGraphLoad } };
// Load handler for the <algo> element of a SCKL algorithm XML file: loads the
// element's children and hands every <gpu> child to scklAlgoXmlGpu.
//
// file     - XML stream being parsed.
// xmlGraph - XML node storage being filled.
// head     - the <algo> node whose sub-elements are loaded.
//
// Returns ncclSuccess when xmlLoadSub succeeds.
ncclResult_t scklAlgoXmlLoad(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
  struct xmlHandler gpuHandlers[] = {
    { "gpu", scklAlgoXmlGpu }
  };
  // Handler count derived from the table itself (one entry).
  const int nGpuHandlers = sizeof(gpuHandlers) / sizeof(gpuHandlers[0]);
  NCCLCHECK(xmlLoadSub(file, xmlGraph, head, gpuHandlers, nGpuHandlers));
  return ncclSuccess;
}
ncclResult_t scklTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) {
ncclResult_t scklGetXmlAlgoFromFile(const char* xmlGraphFile, struct ncclXml* xml) {
FILE* file = fopen(xmlGraphFile, "r");
if (file == NULL) {
WARN("Could not open XML SCKL graph file %s : %s", xmlGraphFile, strerror(errno));
return ncclSystemError;
}
struct xmlHandler handlers[] = { { "system", scklTopoXmlSystemLoad } };
struct xmlHandler handlers[] = { { "algo", scklAlgoXmlLoad } };
xml->maxIndex = 0;
NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1));
fclose(file);

Просмотреть файл

@ -43,7 +43,7 @@ ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml
ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml);
#define NCCL_GRAPH_XML_VERSION 1
ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml);
ncclResult_t scklTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml);
ncclResult_t scklGetXmlAlgoFromFile(const char* xmlGraphFile, struct ncclXml* xml);
/* Auto-detect functions */
ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);

Просмотреть файл

@ -58,6 +58,7 @@ struct ncclRecvMem {
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct scklAlgorithm scklAlgo;
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;

Просмотреть файл

@ -117,20 +117,25 @@ struct ncclRing {
int* devUserRanks;
};
// Capacity of the recv[]/send[] peer arrays in struct scklAlgoState.
#define SCKL_MAX_NUM_CONN 16
// Capacity of scklThreadBlock::transfers, i.e. the largest number of steps a
// single thread block can describe.
#define SCKL_MAX_NUM_STEPS 16
// Capacity of scklAlgorithm::scklTB, i.e. the largest number of thread blocks
// an algorithm can describe.
#define SCKL_MAX_NUM_THREAD_BLOCKS 16
// struct scklConn {
// int peer;
// int nChunks;
// };
// Values stored in scklThreadBlock::type to tell whether a thread block sends
// to or receives from its peer.
#define SCKL_SEND 0
#define SCKL_RECV 1
struct scklAlgoState {
int nRecvPeers;
int nSendPeers;
int recv[SCKL_MAX_NUM_CONN];
int send[SCKL_MAX_NUM_CONN];
// struct scklConn recv[SCKL_MAX_NUM_CONN];
// struct scklConn send[SCKL_MAX_NUM_CONN];
// Description of one thread block of a SCKL algorithm: the peer it talks to,
// the direction of the transfer, and the chunk moved at each step.
struct scklThreadBlock {
// Rank of the peer this thread block communicates with.
// NOTE(review): being uint8_t, this cannot hold -1 or a rank above 255, yet
// scklTransportP2pConnect tests the value against -1 — confirm the intended
// type/sentinel.
uint8_t peer;
uint8_t type; // follow SCKL_SEND and SCKL_RECV macros
// Number of valid entries in transfers[].
uint8_t nsteps;
// step is used to index into this array. transfers[step] is the chunkId to transfer.
uint16_t transfers[SCKL_MAX_NUM_STEPS];
};
// SCKL algorithm as seen by a single rank: the set of thread blocks that the
// GPU whose id equals comm->rank has to run. It is stored in
// ncclComm::scklAlgo.
struct scklAlgorithm {
// Number of thread-block slots in use (entries [0, nBlocks) of scklTB).
int nBlocks;
// rbid is used as an index into this array
struct scklThreadBlock scklTB[SCKL_MAX_NUM_THREAD_BLOCKS];
};
#define NCCL_MAX_TREE_ARITY 3
@ -193,7 +198,6 @@ struct ncclChannel {
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collTree;
struct scklAlgoState sGraph;
int id;

Просмотреть файл

@ -35,8 +35,8 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int ne
// Set CPU affinity
ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
// SCKL setup peers
ncclResult_t scklGetTopoFromXMLAndSetChannels(struct ncclComm* comm);
// SCKL: load the algorithm from the XML file and store it in the communicator
ncclResult_t scklGetAlgoFromXMLAndSetComm(struct ncclComm* comm);
#define NCCL_TOPO_CPU_ARCH_X86 1
#define NCCL_TOPO_CPU_ARCH_POWER 2

Просмотреть файл

@ -55,6 +55,7 @@ struct ncclTransport {
};
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
ncclResult_t scklTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
#endif

Просмотреть файл

@ -824,16 +824,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
INFO(NCCL_INIT, "Connected all trees");
// NetSharedBuffers needs to be set for this to work across nodes.
NCCLCHECK(scklGetTopoFromXMLAndSetChannels(comm));
NCCLCHECK(scklGetAlgoFromXMLAndSetComm(comm));
// Connect SCKL graph
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, channel->sGraph.nRecvPeers, channel->sGraph.recv, channel->sGraph.nSendPeers, channel->sGraph.send), ret, affinity_restore);
NCCLCHECKGOTO(scklTransportP2pConnect(comm, channel), ret, affinity_restore);
}
// It appears that graph is not really needed for P2pSetup. The only place that actually uses it is in ncclTopoGetNetDev which has a bypass for when it is set to NULL.
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL), ret, affinity_restore);
INFO(NCCL_INIT, "Connected SCKL graph");
INFO(NCCL_INIT, "Connected SCKL algorithm");
// Check if we can setup CollNet
if (comm->nNodes > 1 &&

Просмотреть файл

@ -51,6 +51,30 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel*
return ncclSuccess;
}
// SCKL needs to traverse the algorithm to find the peers.
//
// For every thread block of comm->scklAlgo, flags the block's peer as needing
// a connection on `channel` by OR-ing the channel's bit into
// comm->connectSend[peer] (SCKL_SEND blocks) or comm->connectRecv[peer]
// (SCKL_RECV blocks). A block is skipped when its peer is out of range, is this
// rank itself, or is already connected on this channel in that direction.
// This function only records the request bits; it does not open connections
// itself (presumably a later setup call consumes them — verify against the
// caller).
//
// comm    - communicator holding the algorithm and the connect bitmasks.
// channel - channel whose id selects the bit to set.
//
// Always returns ncclSuccess.
ncclResult_t scklTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel) {
  // Shift an unsigned 1: shifting a signed int into the sign bit is undefined.
  const uint32_t mask = (uint32_t)1 << channel->id;
  const struct scklAlgorithm* scklAlgo = &comm->scklAlgo;
  // Counters are only reported through TRACE below.
  int nrecv = 0;
  int nsend = 0;
  for (int i = 0; i < scklAlgo->nBlocks; i++) {
    const struct scklThreadBlock* tb = &scklAlgo->scklTB[i];
    const int peer = tb->peer;
    const int type = tb->type; // SCKL_SEND or SCKL_RECV
    // Reject any peer that cannot index the per-rank arrays, and self.
    if (peer < 0 || peer >= comm->nRanks || peer == comm->rank) continue;
    if (type == SCKL_SEND) {
      if (channel->peers[peer].send.connected) continue;
      comm->connectSend[peer] |= mask;
      nsend++;
    } else if (type == SCKL_RECV) {
      if (channel->peers[peer].recv.connected) continue;
      comm->connectRecv[peer] |= mask;
      nrecv++;
    }
  }
  TRACE(NCCL_INIT, "sckl nsend %d nrecv %d", nsend, nrecv);
  return ncclSuccess;
}
void dumpData(struct ncclConnect* data, int ndata) {
for (int n=0; n<ndata; n++) {
printf("[%d] ", n);