Add detection of the IBM/Power NVLink bridge device.
    Add NUMA support to PCI distance calculations.
    Add the NCCL_IGNORE_CPU_AFFINITY env var (usage sketch below).
    Fix memory leaks. GithubIssue#180
    Fix a compiler warning. GithubIssue#178
    Replace non-standard variable-length arrays. GithubIssue#171
    Fix Tree+Shared Memory crash. GithubPR#185
    Fix LL cleanup hang during long-running DL jobs.
    Fix NCCL_RINGS environment variable handling.
    Add extra checks to catch repeat calls to ncclCommDestroy(). GithubIssue#191
    Improve bootstrap socket connection reliability at scale.
    Fix hostname hashing issue. GithubIssue#187
    Code cleanup: rename all non-device files from *.cu to *.cc.
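
A usage sketch for the new env var, reusing the perf invocation from the README hunk below (it accepts 0 or 1; setting it to 1 makes NCCL ignore the job-supplied CPU affinity and use the GPU's affinity only, per the NCCL env var documentation):

$ NCCL_IGNORE_CPU_AFFINITY=1 ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>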
David Addison 2019-03-14 19:39:20 -07:00
Parent 14e0cf644b
Commit f40ce73e89
81 changed files with 892 additions and 692 deletions

View file

@ -1,5 +1,5 @@
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@ -15,6 +15,7 @@ PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@ -43,7 +44,8 @@ endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
CXXFLAGS += -I $(CUDA_INC)
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
else
.SILENT:

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 4
NCCL_PATCH := 2
NCCL_PATCH := 6
NCCL_SUFFIX :=
PKG_REVISION := 1

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@ -9,10 +9,10 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
##### lib files
LIBNAME := libnccl.so
@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cu
$(OBJDIR)/%.o : %.cc
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@ -107,7 +107,7 @@ install : lib
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}
// Free the peer structures.
CUDACHECK(cudaFree(channel->devPeers));
free(channel->peers);
return ncclSuccess;
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -8,7 +8,7 @@
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
#include "core.h"
#include "devcomm.h"
#include "nccl.h"
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
\
struct ncclComm* comm = firstColl.args.comm; \
struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -7,7 +7,7 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "core.h"
#include "devcomm.h"
#include <cstdio>
#include <cstdint>

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "collectives.h"
#include "common.h"

View file

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -50,7 +50,7 @@ class ncclPrimitives {
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
struct ncclComm* comm;
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@ -239,7 +239,7 @@ class ncclPrimitives {
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
__syncthreads();
@ -329,14 +329,14 @@ class ncclLLPrimitives {
uint64_t sendConnHead;
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclComm* comm;
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
@ -393,7 +393,10 @@ class ncclLLPrimitives {
sendConnHead = *waitPtr;
if (checkAbort(sendConn[i]->opCountRem)) break;
}
if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
if (fifoPtr) {
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
fifoPtr[sendStep[i]%NCCL_STEPS] = size;
}
}
}
@ -402,7 +405,12 @@ class ncclLLPrimitives {
if (tid == i) *postPtr = recvStep[i];
}
inline __device__ void postSend(int i) {
inline __device__ void postSend(int i, int offset) {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
@ -443,9 +451,10 @@ class ncclLLPrimitives {
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
int offset = tid;
// Do multiples of 64 bits
#pragma unroll 2
for (int offset=tid; offset<npack; offset+=nthreads) {
for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
@ -471,7 +480,7 @@ class ncclLLPrimitives {
}
exitIfAbortLocalBarrier();
FOR_RECV(postRecv);
FOR_SEND(postSend);
FOR_SEND(postSend, offset);
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
@ -514,32 +523,9 @@ class ncclLLPrimitives {
}
}
__device__ __forceinline__ void llSendCleaning(int i) {
if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
/* Reset all flags */
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
for (int s=0; s<NCCL_STEPS; s++) {
waitSend(i, 0);
for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
sendPtr(i)[o].i4 = resetLine.i4;
}
}
if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
}
}
__device__ __forceinline__ void llRecvCleaning(int i) {
if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
recvStep[i] += NCCL_STEPS;
if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
}
}
public:
__device__ __forceinline__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
@ -577,8 +563,6 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ ~ncclLLPrimitives() {
for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
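
The hunks above implement the "Fix LL cleanup hang" item: the periodic llSendCleaning()/llRecvCleaning() passes in the destructor, which spun on waitSend() and could stall long-running jobs, are removed, and cleaning now happens inline. A sketch of the new trigger, using names from this diff:

// Every step whose bits match NCCL_LL_CLEAN_MASK is a cleanup step.
inline bool isLLCleanupStep(uint64_t step) {
  return (step & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK;
}

On such a step, postSend() rewrites all NCCL_LL_SLICE_LINES lines of the slice with the current flag, and waitSend() advertises the full slice size to the proxy so the rewritten flags actually reach the remote side.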

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
}
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
// Set active = 2 for the last operation
for (int r=0; r<params->gridDim.x; r++) {
@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
// Compute thresholds and limits that users can override
int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
// First compute nThreads
@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
if (info->comm->userStreamSet == false) {
info->comm->userStream = info->stream;
info->comm->userStreamSet = true;

src/include/alloc.h (new file, 51 lines)
View file

@ -0,0 +1,51 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ALLOC_H_
#define NCCL_ALLOC_H_
#include "nccl.h"
#include "checks.h"
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
return ncclSuccess;
}
#endif
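
A minimal caller sketch for the helpers above (hypothetical function name; NCCLCHECK comes from checks.h, and peers/devPeers are the ncclChannel fields freed by freeChannel() earlier in this diff):

static ncclResult_t allocChannelPeers(struct ncclChannel* channel, int nRanks) {
  // Host-side peer array, zeroed by ncclCalloc.
  NCCLCHECK(ncclCalloc(&channel->peers, nRanks));
  // Device-side mirror, allocated and zeroed on the GPU.
  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, nRanks));
  return ncclSuccess;
}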

src/include/argcheck.h (new file, 15 lines)
View file

@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ARGCHECK_H_
#define NCCL_ARGCHECK_H_
#include "core.h"
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,73 @@
/*************************************************************************
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#ifndef NCCL_CHECKS_H_
#define NCCL_CHECKS_H_
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
#include "debug.h"
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#endif
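
An illustrative composition of these macros (hypothetical function and helper names):

static ncclResult_t setupComm(struct ncclComm* comm) {
  ncclResult_t res = ncclSuccess;
  CUDACHECK(cudaSetDevice(comm->cudaDev));            // cudaError_t mapped to ncclUnhandledCudaError
  NCCLCHECKGOTO(bootstrapSetup(comm), res, cleanup);  // hypothetical helper; logs file:line and jumps on failure
  return ncclSuccess;
cleanup:
  free(comm->peerInfo);  // release partial state before propagating res
  return res;
}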

src/include/comm.h (new file, 127 lines)
View file

@ -0,0 +1,127 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t opCount;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Channels for collectives
int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// Tree algorithm threshold
ssize_t treeThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
// Error reported by GPU
volatile ncclDevError_t* fatalDevError;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
// Device side of the communicator
struct ncclDevComm *devComm;
// Host copy of the devComm (to free CUDA allocs)
struct ncclDevComm hostDevComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
};
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -7,385 +7,20 @@
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
#include <pthread.h>
#include <algorithm>
#include "nccl.h"
#include "transport.h"
#include "debug.h"
#include "checks.h"
#include "alloc.h"
#include "transport.h"
#include "devcomm.h"
#include "comm.h"
#include "info.h"
#include "argcheck.h"
#include <cstdio>
#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXCHANNELS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NCCL_LL_MIN_NTHREADS 64
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
typedef enum {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown
} ncclPattern_t;
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
ncclDevSuspectedMismatch
} ncclDevError_t;
// Used to pass NCCL call information between functions
struct ncclInfo {
ncclColl_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root;
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclPattern_t pattern;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
};
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
uint64_t step; // Keep where we are
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
};
struct ncclConnector {
int connected;
struct ncclProxyArgs *proxyAppend;
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
struct ncclComm *comm;
};
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t opCount;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
};
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
};
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
int id;
int nthreads;
int buffSize;
// Communication structures
struct ncclPeer* peers;
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Channels for collectives
int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// Tree algorithm threshold
ssize_t treeThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
// Error reported by GPU
volatile ncclDevError_t* fatalDevError;
// On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
// On device: this pointer has been obtained from cudaHostGetDevicePointer()
volatile uint32_t *abortFlag;
// Device copy of the communicator
struct ncclComm *devComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
};
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
return ncclSuccess;
}
#endif // end include guard

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -24,7 +24,7 @@ extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
@ -108,7 +108,7 @@ static inline void initDebug() {
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024);
getHostName(hostname, 1024, '.');
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid

src/include/devcomm.h (new file, 194 lines)
View file

@ -0,0 +1,194 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_
#include "nccl.h"
#include <stdint.h>
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define MAXTHREADS 256
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#ifdef DEBUG_LL
#define NCCL_LL_CLEAN_MASK 0x00000ff8
#define NCCL_LL_FLAG_MAX 0x00001000
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_NSTEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
uint64_t step; // Keep where we are
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
};
struct ncclConnector {
int connected;
struct ncclProxyArgs *proxyAppend;
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
struct ncclComm *comm;
};
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
};
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
};
struct ncclDevComm;
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclDevComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
int id;
int nthreads;
int buffSize;
// Communication structures
struct ncclPeer* peers;
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#define MAXCHANNELS 16
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
ncclDevSuspectedMismatch
} ncclDevError_t;
struct ncclDevComm {
int rank;
int nRanks;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile ncclDevError_t *fatalDevError;
// Channels, device side
struct ncclChannel* channels;
};
#endif
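
A standalone sanity sketch of the flag-wrap arithmetic above, using the DEBUG_LL constants (illustrative only):

#include <cassert>
#include <cstdint>
int main() {
  const uint64_t FLAG_MAX = 0x1000, CLEAN_MASK = 0xff8, STEPS = 8;
  assert(CLEAN_MASK % STEPS == 0);  // the static_assert above, restated
  // Steps 0xff8..0xfff all match the mask, so the last NCCL_STEPS rounds
  // before the flag wraps rewrite every line in the buffer.
  for (uint64_t step = 0xff8; step <= 0xfff; step++)
    assert((step & CLEAN_MASK) == CLEAN_MASK);
  // NCCL_LL_FLAG keeps flag values below FLAG_MAX after the wrap.
  assert(0x1001 % FLAG_MAX == 1);
  return 0;
}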

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -10,6 +10,12 @@
#include "core.h"
#include "group.h"
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MIN_NTHREADS 64
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);

View file

@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

src/include/info.h (new file, 45 lines)
View file

@ -0,0 +1,45 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INFO_H_
#define NCCL_INFO_H_
#include "nccl.h"
typedef enum {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
struct ncclInfo {
ncclColl_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root;
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclPattern_t pattern;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
};
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -80,12 +80,13 @@ typedef struct {
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
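
The "may return request == NULL" contract above means a NULL request is not an error: the transport simply could not post the operation yet, and the caller must retry. A toy standalone model of that contract (none of this is the real plugin API):

#include <stdio.h>
#include <stddef.h>

// Toy transport: isend sets *request = NULL when it cannot post the send
// yet; the caller retries later. Purely illustrative.
typedef struct { int canPost; } toyComm;

static int toyIsend(toyComm* comm, const void* data, int size, void** request) {
  (void)data; (void)size;
  if (!comm->canPost) { *request = NULL; return 0; }  // would block
  *request = comm;  // stand-in for a real request handle
  return 0;
}

int main(void) {
  toyComm comm = { 0 };
  void* req = NULL;
  for (int attempt = 0; req == NULL; attempt++) {
    toyIsend(&comm, "payload", 7, &req);
    if (req == NULL) {
      printf("attempt %d: would block, retrying\n", attempt);
      comm.canPost = 1;  // pretend resources freed up
    }
  }
  printf("send posted\n");
  return 0;
}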

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -18,6 +18,7 @@
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
char* rPath = realpath(classPath, NULL);
int fd;
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
if ((fd = open(rPath, O_RDONLY)) == -1) {
// Could not open the device. This can happen in a VM, where we don't
// see the whole machine, so handle it silently: log at TRACE level
// instead of printing an INFO-level error.
TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
free(rPath);
return ncclSystemError;
}
free(rPath);
char pciClass[9];
strncpy(pciClass, "0x000000", 9);
@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
if (strcmp(pciClass, "0x068000") == 0) {
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
*type = ncclNvLinkDeviceSwitch;
} else if (strcmp(pciClass, "0x068001") == 0) {
// PCI device is of type "Bridge: IBM Device 04ea"
*type = ncclNvLinkDeviceBridge;
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
*type = ncclNvLinkDeviceGpu;
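
The same classification can be reproduced from userspace by reading a device's class file in sysfs. A standalone sketch (the bus ID is a placeholder; the class values are the ones matched above):

#include <stdio.h>
#include <string.h>

int main(int argc, char** argv) {
  const char* busId = argc > 1 ? argv[1] : "0000:00:00.0";  // placeholder
  char path[128];
  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", busId);
  FILE* f = fopen(path, "r");
  if (f == NULL) { perror("fopen"); return 1; }
  char pciClass[16] = {0};
  if (fgets(pciClass, sizeof(pciClass), f) == NULL) { fclose(f); return 1; }
  fclose(f);
  pciClass[strcspn(pciClass, "\n")] = '\0';
  if (strcmp(pciClass, "0x068000") == 0) puts("bridge: NVswitch");
  else if (strcmp(pciClass, "0x068001") == 0) puts("bridge: IBM Device 04ea");
  else if (strcmp(pciClass, "0x030200") == 0 || strcmp(pciClass, "0x030000") == 0) puts("GPU");
  else puts("other");
  return 0;
}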

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // sleep interval in usec
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif
int ret;
int retries = 0;
int timedout_retries = 0;
int refused_retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
usleep(SLEEP_INT);
goto retry;
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
goto retry;
}
}
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;
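
The arithmetic behind the constants: 2e4 refused retries at a 1000 usec sleep preserve the old 20 s budget, while ETIMEDOUT gets only 3 retries because each timed-out attempt can itself block for about 20 s. A standalone sketch of the retry split, with a toy stub in place of a real connect():

#include <stdio.h>
#include <errno.h>

#define RETRY_REFUSED_TIMES 20000  // 20000 * 1000 usec sleep = 20 s budget
#define RETRY_TIMEDOUT_TIMES 3     // each timeout may already take ~20 s

// Toy stand-in for connect(): refuses three times, then succeeds.
static int fakeConnect(int* attempts) {
  if (++(*attempts) < 4) { errno = ECONNREFUSED; return -1; }
  return 0;
}

int main(void) {
  int attempts = 0, refused = 0, timedout = 0;
  for (;;) {
    if (fakeConnect(&attempts) == 0) {
      printf("connected after %d tries\n", attempts);
      return 0;
    }
    if ((errno == ECONNREFUSED && ++refused < RETRY_REFUSED_TIMES) ||
        (errno == ETIMEDOUT && ++timedout < RETRY_TIMEDOUT_TIMES)) {
      continue;  // the real code sleeps SLEEP_INT usec here
    }
    printf("giving up\n");
    return 1;
  }
}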

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -11,49 +11,35 @@
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
ncclResult_t getCudaPath(int cudaDev, char** path);
static ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[BUSID_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
*path = realpath(busPath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", busPath);
return ncclSystemError;
}
return ncclSuccess;
static int getNumaId(char *path) {
char npath[PATH_MAX];
snprintf(npath, PATH_MAX, "%s/numa_node", path);
npath[PATH_MAX-1] = '\0';
int numaId = -1;
FILE *file = fopen(npath, "r");
if (file == NULL) return -1;
if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
fclose(file);
return numaId;
}
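
For illustration, getNumaId() can be exercised directly on any sysfs device directory. A tiny driver, assuming the function above is in scope (the path is a placeholder):

#include <stdio.h>

int main(void) {
  char path[] = "/sys/devices/pci0000:00/0000:00:01.0";  // placeholder device
  printf("numa node: %d\n", getNumaId(path));  // -1 if numa_node is absent
  return 0;
}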
enum ncclPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_SOC = 3
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_NODE = 3,
PATH_SYS = 4,
PATH_ARRAY_SIZE = 5
};
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
extern const char* pathDists[PATH_ARRAY_SIZE];
static int pciDistance(char* path1, char* path2) {
int score = 0;
int depth = 0;
int same = 1;
for (int i=0; i<strlen(path1); i++) {
if (path1[i] != path2[i]) same = 0;
if (path1[i] == '/') {
depth++;
if (same == 1) score++;
}
}
if (score <= 3) return PATH_SOC;
if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX;
return PATH_PXB;
}
int pciDistance(char* path1, char* path2);
#endif

View file

@ -8,6 +8,7 @@
#define NCCL_TRANSPORT_H_
#include "nccl.h"
#include "devcomm.h"
#include <stdint.h>
#include "nvmlwrap.h"
@ -37,7 +38,7 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
}
}
inline void transportProxyIdle(int idle) {
sched_yield();
}
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -10,7 +10,7 @@
#include "nccl.h"
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen);
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHostHash();
uint64_t getPidHash();

View file

@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout;
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if CUDART_VERSION >= 9200
#if CUDART_VERSION >= 9020
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
return bootstrapGetUniqueId(out);
}
// Prevent compiler from optimizing out these operations
void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
}
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
CUDACHECK(cudaFree(comm->hostDevComm.channels));
CUDACHECK(cudaFree(comm->devComm));
for (int channel=0; channel<comm->nChannels; channel++)
@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
// Poison comm to try to catch a double free
commPoison(comm);
free(comm);
return ncclSuccess;
}
@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
comm->rank = rank;
comm->nRanks = ndev;
comm->rank = comm->hostDevComm.rank =rank;
comm->nRanks = comm->hostDevComm.nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9200
#if CUDART_VERSION >= 9020
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to overload the default setting in older CUDA builds
@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
#endif
comm->fatalError = ncclSuccess;
CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
*comm->fatalDevError = ncclDevSuccess;
CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
*comm->abortFlag = 0;
comm->argsptr = &comm->args;
@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
}
static ncclResult_t devCommSetup(ncclComm_t comm) {
// Fully duplicate the comm on the device
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
// Copy the comm on the device
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
// Copy userRanks
// Duplicate the channels on the device
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
// Copy userRanks and peers
for (int r=0; r<comm->nChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
}
// Copy the device-accessible pointer to comm->abortFlag
void *devAbortFlag;
CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
// Copy the device-accessible pointer to comm->fatalDevError
void *devFatalError;
CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
// Duplicate the dev comm on the device
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
return ncclSuccess;
}
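
devCommSetup() now follows a host-shadow pattern: keep a host copy of the device struct (hostDevComm), point its members at device allocations, then ship the whole struct to the GPU in one copy. A standalone CUDA-runtime sketch of that pattern with made-up types:

#include <cuda_runtime.h>
#include <cstdio>

// Host-side shadow of a struct destined for the GPU: fix up device
// pointers on the host copy, then memcpy the struct wholesale.
struct DevThing { int nItems; int* items; };

int main() {
  DevThing host;                             // host shadow (cf. hostDevComm)
  host.nItems = 4;
  cudaMalloc(&host.items, 4 * sizeof(int));  // device pointer stored on host
  DevThing* dev;
  cudaMalloc(&dev, sizeof(DevThing));
  cudaMemcpy(dev, &host, sizeof(DevThing), cudaMemcpyHostToDevice);
  printf("device struct ready\n");
  cudaFree(host.items); cudaFree(dev);
  return 0;
}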
@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
}
}
int ranks[nMasters];
int* ranks;
NCCLCHECK(ncclCalloc(&ranks, nMasters));
int i = 0, masterIndex = -1;
// Build binary tree
for (int r=0; r<nranks; r++) {
@ -455,6 +461,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
tree->up = prev;
if (treeMasters[next] == 0) tree->down[0] = next;
}
free(ranks);
}
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) { ++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].send;
if (conn->connected) { ++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].send;
if (conn->connected) {++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, conn));
conn->connected = 1;
@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) {++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, conn));
conn->connected = 1;
@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
return ncclSuccess;
}
NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
static ncclResult_t setCpuAffinity(int cudaDev) {
// Work within the envelope we were provided
// Query the CPU affinity set we were provided
cpu_set_t mask;
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
// Find the subpart that is local to our GPU
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
}
#endif
// Find the CPUs that are local to the supplied GPU
cpu_set_t gpuMask;
NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
cpu_set_t finalMask;
CPU_AND(&finalMask, &mask, &gpuMask);
// If those are not disjoint, try to stay local
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
}
#endif
cpu_set_t finalMask;
if (ncclParamIgnoreCpuAffinity())
// Ignore the CPU affinity set and use the GPU one instead
finalMask = gpuMask;
else
// Use a subset of the GPU affinity set
CPU_AND(&finalMask, &mask, &gpuMask);
// If there is a non-empty set, use it to set affinity
if (CPU_COUNT(&finalMask)) {
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
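
A toy standalone illustration of the two policies above: by default the GPU-local mask is intersected with the inherited one, while NCCL_IGNORE_CPU_AFFINITY=1 takes the GPU-local mask wholesale (the masks here are made up):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  cpu_set_t inherited, gpuLocal, finalMask;
  CPU_ZERO(&inherited); CPU_SET(0, &inherited); CPU_SET(1, &inherited);
  CPU_ZERO(&gpuLocal);  CPU_SET(1, &gpuLocal);  CPU_SET(2, &gpuLocal);
  const char* ignore = getenv("NCCL_IGNORE_CPU_AFFINITY");
  if (ignore && atoi(ignore) == 1) {
    finalMask = gpuLocal;                        // ignore inherited affinity
  } else {
    CPU_AND(&finalMask, &inherited, &gpuLocal);  // stay within both sets
  }
  printf("final mask has %d cpu(s)\n", CPU_COUNT(&finalMask));  // 2 vs 1
  return 0;
}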
@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
comms[rank]->threadThreshold = threadThreshold;
}
struct ncclConnect* connect;
NCCLCHECK(ncclCalloc(&connect, 2*nranks));
for (int r=0; r<nrings; r++) {
struct ncclConnect connect[2*nranks];
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
@ -1045,6 +1081,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
}
}
free(connect);
free(allInfo);
free(rings);
free(treeIn);
@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
int ncclDevList[ndev];
int* ncclDevList = NULL;
NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
}
cudaGetDevice(&savedDevice);
CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
for(rank=0; rank<ndev; ++rank)
comms[rank] = NULL;
@ -1118,6 +1156,7 @@ cleanup:
}
final:
free(ncclDevList);
if(wrapNvmlShutdown() != ncclSuccess)
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice);
@ -1128,9 +1167,11 @@ final:
static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
#ifdef ENABLE_TRACE
int rank = comm->rank;
#endif
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
int rank = comm->rank;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
return ncclSuccess;
}
@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
// Try to prevent a double free of the comm struct (user error)
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
WARN("comm %p has already been destroyed", comm);
return ncclInvalidArgument;
}
return commDestroy(comm);
}
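
With commPoison() plus this check, an accidental second destroy now fails with ncclInvalidArgument instead of silently corrupting memory. It is best-effort only, since the second call inspects freed storage; hence "try to prevent". Illustrative usage, assuming comm came from a successful init:

ncclCommDestroy(comm);                   // frees and poisons the struct
ncclResult_t r = ncclCommDestroy(comm);  // user error: now caught, warns and
                                         // returns ncclInvalidArgument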

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "checks.h"
#include "argcheck.h"
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() {
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
int done = ncclGroupIndex;
int doneArray[ncclGroupIndex];
int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
ncclResult_t ret = ncclGroupError;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -208,8 +208,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
NCCLCHECK(getEnvThreads(nthreads));
for (int r = 0; r<*nrings; r++) {
for (int i = 0; i<nranks; i++) {
if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
}
}
return ncclSuccess;

src/misc/topo.cc (new file, +51 lines)
View file

@ -0,0 +1,51 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "topo.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[BUSID_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
*path = realpath(busPath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", busPath);
return ncclSystemError;
}
return ncclSuccess;
}
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
int pciDistance(char* path1, char* path2) {
int score = 0;
int depth = 0;
int same = 1;
for (int i=0; i<strlen(path1); i++) {
if (path1[i] != path2[i]) same = 0;
if (path1[i] == '/') {
depth++;
if (same == 1) score++;
}
}
if (score <= 3) {
/* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
int numaId1 = getNumaId(path1);
int numaId2 = getNumaId(path2);
TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
}
if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX;
return PATH_PXB;
}
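
Assuming the file above is linked in (together with its NCCL dependencies), pciDistance() can be probed with two sysfs paths; it prints one of the pathDists labels. The paths below are placeholders:

#include <stdio.h>

int main(void) {
  char gpu[] = "/sys/devices/pci0000:00/0000:00:02.0/0000:03:00.0";  // placeholder
  char nic[] = "/sys/devices/pci0000:00/0000:00:02.0/0000:04:00.0";  // placeholder
  printf("distance: %s\n", pathDists[pciDistance(gpu, nic)]);
  return 0;
}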

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -29,13 +29,13 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
return ncclSuccess;
}
ncclResult_t getHostName(char* hostname, int maxlen) {
ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
if (gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
return ncclSystemError;
}
int i = 0;
while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
hostname[i] = '\0';
return ncclSuccess;
}
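
The extra delim parameter controls where the hostname is cut: '.' keeps only the short name for logging, while '\0' keeps the full name, which getHostHash() now starts from. A tiny driver, assuming the function above is in scope:

#include <stdio.h>

int main(void) {
  char shortName[1024], fullName[1024];
  getHostName(shortName, sizeof(shortName), '.');  // "node12.cluster" -> "node12"
  getHostName(fullName, sizeof(fullName), '\0');   // full name kept
  printf("short: %s\nfull:  %s\n", shortName, fullName);
  return 0;
}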
@ -48,7 +48,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
if (ncclDebugLevel <= NCCL_LOG_NONE) return;
char hostname[1024];
getHostName(hostname, 1024);
getHostName(hostname, 1024, '.');
int cudaDev;
cudaGetDevice(&cudaDev);
@ -104,8 +104,8 @@ uint64_t getHash(const char* string) {
*/
uint64_t getHostHash(void) {
char uname[1024];
// Start off with the hostname
(void) getHostName(uname, sizeof(uname));
// Start off with the full hostname
(void) getHostName(uname, sizeof(uname), '\0');
int offset = strlen(uname);
int len;
// $(readlink /proc/self/ns/uts)

View file

View file

@ -28,7 +28,7 @@ static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF
static ncclTvalue_t getTvalue(short* distances, int ndev) {
ncclTvalue_t tvalue = 0;
for (int d=0; d<ndev; d++) {
int score = 1 + PATH_SOC - distances[d];
int score = 1 + PATH_SYS - distances[d];
// Keep 3 bits of score info per dev
tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
}
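
Each network interface contributes a 3-bit score packed into one ncclTvalue_t, as the comment above notes. A standalone round-trip of that packing (the constants mirror the ones referenced above; the typedef width is an assumption for this sketch):

#include <stdio.h>
#include <stdint.h>

#define NET_BITS_PER_IF 3
#define NET_BITS_PER_IF_MASK ((1 << NET_BITS_PER_IF) - 1)
typedef uint64_t ncclTvalue_t;  // assumed width for this sketch

int main(void) {
  int scores[3] = { 5, 2, 7 };  // example per-device scores
  ncclTvalue_t tvalue = 0;
  for (int d = 0; d < 3; d++)
    tvalue |= ((ncclTvalue_t)(scores[d] & NET_BITS_PER_IF_MASK)) << (NET_BITS_PER_IF * d);
  for (int d = 0; d < 3; d++)
    printf("dev %d score %d\n", d,
           (int)((tvalue >> (NET_BITS_PER_IF * d)) & NET_BITS_PER_IF_MASK));
  return 0;
}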
@ -81,7 +81,7 @@ static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
ncclResult_t err;
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
err = ncclNetPciPath(dev, &nicPath);
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
if (nicPath) free(nicPath);
if (cudaPath) free(cudaPath);
return ncclSuccess;
@ -173,19 +173,19 @@ static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroup
return -1;
}
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
int nGroups = groups[nranks-1] + 1;
int cardUsed[NET_MAX_IFS*nGroups];
for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
int *cardUsed, *starts, *ends;
NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
NCCLCHECK(ncclCalloc(&starts, nGroups));
NCCLCHECK(ncclCalloc(&ends, nGroups));
for (int ring = 0; ring<*nringsRet; ring++) {
int starts[nGroups];
int ends[nGroups];
for (int group = 0; group<nGroups; group++) {
int nranksInGroup = 0;
int nsubGroups = 0;
for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
for (int rank=0; rank<nranks; rank++)
if (groups[rank] == group) {
nranksInGroup++;
nsubGroups = std::max(subgroups[rank], nsubGroups);
}
@ -207,7 +207,7 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
}
if (starts[group] == -1 || ends[group] == -1) {
*nringsRet = ring;
return ncclSuccess;
goto done;
}
}
// Link groups together
@ -217,6 +217,10 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
prev[ring*nranks+starts[nextGroup]] = ends[group];
}
}
done:
free(cardUsed);
free(starts);
free(ends);
return ncclSuccess;
}
@ -432,11 +436,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head < args->end) {
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
if (args->llMode) {
int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
if (size != -1) {
uint32_t flag = args->tail + 1;
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
size = nFifoLines * sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
}
} else if (args->tail < resources->hostRecvMem->tail) {
} else if (args->tail < *recvTail) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
int stepSize = args->channel->buffSize/NCCL_STEPS;
// Send through network
@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpDone;
args->state = ncclProxyOpNone;
}
}
if (args->state == ncclProxyOpDone) {
union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
resources->step += NCCL_STEPS;
resources->hostSendMem->head = resources->step;
resources->llLastCleaning = resources->step;
}
args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
volatile uint64_t* sendHead = &resources->hostSendMem->head;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpDone;
args->state = ncclProxyOpNone;
}
}
if (args->state == ncclProxyOpDone) {
if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
resources->step += NCCL_STEPS;
while (resources->hostSendMem->head < resources->step);
resources->llLastCleaning = resources->step;
}
args->state = ncclProxyOpNone;
}
return ncclSuccess;
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
}
int found = 0;
struct ibv_device_attr devAttr;
memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
WARN("NET/IB : Unable to query device %s", devices[d]->name);
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) {
/* Determine if we can communicate with the peer through p2p */
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
// Do not use P2P across root complexes by default (provided CUDA permits it)
int p2pLevel = PATH_SOC;
int p2pLevel = PATH_NODE;
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
if (peerCudaDev == -1) {
// Peer's CUDA device is not visible in this process
#if CUDART_VERSION >= 10010
// But in CUDA 10.1 we can still communicate with 'invisible' devices
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
// Check for NVLink/NVswitch including P2P access
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
if (nvlinkp2p > 0) {
*ret = nvlinkp2p;
return ncclSuccess;
}
#endif
return ncclSuccess;
}
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
// Do not detect topology if we're on the same GPU. Note this is not really supported.
if (myInfo->cudaDev == peerCudaDev) {
*ret = 1 + PATH_SOC;
*ret = 1 + PATH_SYS;
return ncclSuccess;
}
@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
if (err1 == ncclSuccess && err2 == ncclSuccess) {
int distance = pciDistance(myPath, peerPath);
if (distance < p2pLevel) {
*ret = 1 + PATH_SOC - distance;
*ret = 1 + PATH_SYS - distance;
}
}
if (err1 == ncclSuccess) free(myPath);
@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
return ncclSuccess;
}
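
The return value inverts the distance so that closer paths score higher: PATH_PIX maps to the top score and PATH_SYS to the bottom one. A one-line check of that mapping (PATH_* values as in topo.h above):

#include <stdio.h>

enum { PATH_PIX = 0, PATH_PXB = 1, PATH_PHB = 2, PATH_NODE = 3, PATH_SYS = 4 };

int main(void) {
  for (int d = PATH_PIX; d <= PATH_SYS; d++)
    printf("distance %d -> score %d\n", d, 1 + PATH_SYS - d);  // 5 down to 1
  return 0;
}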
#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
#define MAXGPUS_PCI 64
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
int nrings = 0;
ncclTvalue_t* line = matrix+current*n;
@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
}
}
} else {
int ringsSave[nRingsMax*n];
int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
int maxStep = 0;
for (int i=0; i<n; i++) {
if (inTheRing[i] == 0 && line[i] > 0) {
@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
}
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
for (int score = PATH_SOC+1; score >= minScore; score--) {
for (int score = PATH_SYS+1; score >= minScore; score--) {
int best = -1;
int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
for (int n = 0; n < nranks; n++) {
if (inRing[n]) continue;
if (values[rank*nranks+n] == score) {
@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
int start = findConnect(nranks, prev+r*nranks);
int end = findConnect(nranks, next+r*nranks);
int inRing[nranks];
int inRing[MAXGPUS_PCI];
for (int i=0; i<nranks; i++) inRing[i] = 0;
if (start == -1 && end == -1) {
@ -405,10 +421,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
links += val/CONNECT_NVLINK;
}
if (rank == 0) directLinks = links;
else directLinks = std::min(directLinks, links);
else directLinks = std::min(directLinks, links);
}
if (directLinks > 0) {
// NVLink : Connect rings or create new ones
if (nranks > MAXGPUS_NVLINKP2P) {
WARN("Recursive P2P computation cannot work for >8 GPUs");
return ncclInternalError;
}
nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
goto end;
}
@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) {
if (sendRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
CUDACHECK(cudaFree(sendRes->devMem));
free(sendRes);
return ncclSuccess;
}
@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) {
if (recvRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
CUDACHECK(cudaFree(recvRes->devMem));
free(recvRes);
return ncclSuccess;
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
return -1;
}
#define MAXGROUPS 16
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
int nGroups = groups[nranks-1] + 1;
int starts[nGroups];
int ends[nGroups];
int starts[MAXGROUPS];
int ends[MAXGROUPS];
for (int ring = 0; ring<*nringsRet; ring++) {
int startGroup = -1, endGroup = -1;
for (int group = 0; group<nGroups; group++) {