From f40ce73e8987d2990e4b9ef6c75f4b3423acce78 Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 14 Mar 2019 19:39:20 -0700 Subject: [PATCH] NCCL 2.4.6-1 Added detection of IBM/Power NVLink bridge device. Add NUMA support to PCI distance calculations. Added NCCL_IGNORE_CPU_AFFINITY env var. Fix memory leaks; GithubIssue#180 Compiler warning fix; GithubIssue#178 Replace non-standard variable length arrays. GithubIssue#171 Fix Tree+Shared Memory crash. GithubPR#185 Fix LL cleanup hang during long running DL jobs. Fix NCCL_RINGS environment variable handling. Added extra checks to catch repeat calls to ncclCommDestroy() GithubIssue#191 Improve bootstrap socket connection reliability at scale. Fix hostname hashing issue. GithubIssue#187 Code cleanup to rename all non device files from *.cu to *.cc --- LICENSE.txt | 2 +- Makefile | 2 +- README.md | 2 +- ext-net/dummy/Makefile | 2 +- ext-net/dummy/plugin.c | 2 +- makefiles/common.mk | 8 +- makefiles/formatting.mk | 2 +- makefiles/version.mk | 2 +- pkg/Makefile | 2 +- pkg/debian/Makefile | 2 +- pkg/redhat/Makefile | 2 +- pkg/srctxz/Makefile | 2 +- pkg/srctxz/create_srctxz.sh.in | 2 +- pkg/txz/Makefile | 2 +- pkg/txz/create_txz.sh.in | 2 +- src/Makefile | 20 +- src/{bootstrap.cu => bootstrap.cc} | 2 +- src/{channel.cu => channel.cc} | 7 +- .../{all_gather.cu => all_gather.cc} | 2 +- .../{all_reduce.cu => all_reduce.cc} | 2 +- .../{broadcast.cu => broadcast.cc} | 2 +- src/collectives/collectives.h | 2 +- src/collectives/device/Makefile | 2 +- src/collectives/device/all_gather.cu | 2 +- src/collectives/device/all_gather.h | 8 +- src/collectives/device/all_reduce.cu | 2 +- src/collectives/device/all_reduce.h | 12 +- src/collectives/device/broadcast.cu | 2 +- src/collectives/device/broadcast.h | 8 +- src/collectives/device/common.h | 6 +- src/collectives/device/common_kernel.h | 4 +- src/collectives/device/functions.cu | 4 +- src/collectives/device/gen_rules.sh | 2 +- src/collectives/device/primitives.h | 54 +-- src/collectives/device/reduce.cu | 2 +- src/collectives/device/reduce.h | 8 +- src/collectives/device/reduce_scatter.cu | 2 +- src/collectives/device/reduce_scatter.h | 8 +- src/collectives/{reduce.cu => reduce.cc} | 2 +- .../{reduce_scatter.cu => reduce_scatter.cc} | 2 +- src/{enqueue.cu => enqueue.cc} | 8 +- src/include/alloc.h | 51 +++ src/include/argcheck.h | 15 + src/include/bootstrap.h | 2 +- src/include/channel.h | 2 +- src/include/checks.h | 71 ++- src/include/comm.h | 127 ++++++ src/include/core.h | 423 +----------------- src/include/cpuset.h | 2 +- src/include/debug.h | 6 +- src/include/devcomm.h | 194 ++++++++ src/include/enqueue.h | 8 +- src/include/ibvwrap.h | 2 +- src/include/info.h | 45 ++ src/include/nccl_net.h | 7 +- src/include/net.h | 2 +- src/include/nvlink.h | 14 +- src/include/nvmlwrap.h | 2 +- src/include/param.h | 3 +- src/include/rings.h | 2 +- src/include/shm.h | 2 +- src/include/socket.h | 21 +- src/include/topo.h | 60 +-- src/include/transport.h | 7 +- src/include/trees.h | 2 +- src/include/utils.h | 4 +- src/{init.cu => init.cc} | 111 +++-- src/misc/{checks.cu => argcheck.cc} | 4 +- src/misc/{group.cu => group.cc} | 4 +- src/misc/{ibvwrap.cu => ibvwrap.cc} | 2 +- src/misc/{nvmlwrap.cu => nvmlwrap.cc} | 2 +- src/misc/{rings.cu => rings.cc} | 6 +- src/misc/topo.cc | 51 +++ src/misc/{trees.cu => trees.cc} | 2 +- src/misc/{utils.cu => utils.cc} | 12 +- src/{transport.cu => transport.cc} | 0 src/transport/{net.cu => net.cc} | 52 +-- src/transport/{net_ib.cu => net_ib.cc} | 3 +- .../{net_socket.cu => net_socket.cc} | 2 +- src/transport/{p2p.cu => p2p.cc} | 40 +- src/transport/{shm.cu => shm.cc} | 8 +- 81 files changed, 892 insertions(+), 692 deletions(-) rename src/{bootstrap.cu => bootstrap.cc} (99%) rename src/{channel.cu => channel.cc} (91%) rename src/collectives/{all_gather.cu => all_gather.cc} (92%) rename src/collectives/{all_reduce.cu => all_reduce.cc} (92%) rename src/collectives/{broadcast.cu => broadcast.cc} (94%) rename src/collectives/{reduce.cu => reduce.cc} (92%) rename src/collectives/{reduce_scatter.cu => reduce_scatter.cc} (92%) rename src/{enqueue.cu => enqueue.cc} (97%) create mode 100644 src/include/alloc.h create mode 100644 src/include/argcheck.h create mode 100644 src/include/comm.h create mode 100644 src/include/devcomm.h create mode 100644 src/include/info.h rename src/{init.cu => init.cc} (93%) rename src/misc/{checks.cu => argcheck.cc} (96%) rename src/misc/{group.cu => group.cc} (98%) rename src/misc/{ibvwrap.cu => ibvwrap.cc} (99%) rename src/misc/{nvmlwrap.cu => nvmlwrap.cc} (99%) rename src/misc/{rings.cu => rings.cc} (98%) create mode 100644 src/misc/topo.cc rename src/misc/{trees.cu => trees.cc} (98%) rename src/misc/{utils.cu => utils.cc} (94%) rename src/{transport.cu => transport.cc} (100%) rename src/transport/{net.cu => net.cc} (93%) rename src/transport/{net_ib.cu => net_ib.cc} (99%) rename src/transport/{net_socket.cu => net_socket.cc} (99%) rename src/transport/{p2p.cu => p2p.cc} (94%) rename src/transport/{shm.cu => shm.cc} (98%) diff --git a/LICENSE.txt b/LICENSE.txt index 3593a7a..e318c66 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,5 +1,5 @@ - Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/Makefile b/Makefile index 605e3bf..caed3d4 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/README.md b/README.md index fa51453..abfd1cd 100644 --- a/README.md +++ b/README.md @@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g ## Copyright -All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. diff --git a/ext-net/dummy/Makefile b/ext-net/dummy/Makefile index d1eb4c5..efa841c 100644 --- a/ext-net/dummy/Makefile +++ b/ext-net/dummy/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c index f11b365..67d7d88 100644 --- a/ext-net/dummy/plugin.c +++ b/ext-net/dummy/plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/makefiles/common.mk b/makefiles/common.mk index d0e2ca8..2ad5c73 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -15,6 +15,7 @@ PROFAPI ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) @@ -43,7 +44,8 @@ endif #$(info NVCC_GENCODE is ${NVCC_GENCODE}) CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden -CXXFLAGS += -Wall -Wno-sign-compare +CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla +CXXFLAGS += -I $(CUDA_INC) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt @@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3 endif ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra +NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra else .SILENT: diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk index 4a4ab88..a543131 100644 --- a/makefiles/formatting.mk +++ b/makefiles/formatting.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/makefiles/version.mk b/makefiles/version.mk index a8c6e3a..7abaaaf 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 4 -NCCL_PATCH := 2 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/pkg/Makefile b/pkg/Makefile index 04b23da..ab6487b 100644 --- a/pkg/Makefile +++ b/pkg/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile index 439635f..7884cf2 100644 --- a/pkg/debian/Makefile +++ b/pkg/debian/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile index ffcc973..0808478 100644 --- a/pkg/redhat/Makefile +++ b/pkg/redhat/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile index ed677fe..01cab95 100644 --- a/pkg/srctxz/Makefile +++ b/pkg/srctxz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in index ae7d01f..11bdd52 100644 --- a/pkg/srctxz/create_srctxz.sh.in +++ b/pkg/srctxz/create_srctxz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile index fa587ef..b7d9aa5 100644 --- a/pkg/txz/Makefile +++ b/pkg/txz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in index 73922e0..deae854 100644 --- a/pkg/txz/create_txz.sh.in +++ b/pkg/txz/create_txz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/Makefile b/src/Makefile index fe60b11..2d32dca 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -9,10 +9,10 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \ - misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \ - transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ - collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \ + misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \ + transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \ + collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc ##### lib files LIBNAME := libnccl.so @@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) -LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o) +LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl @@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h mkdir -p $(INCDIR) cp -f $< $@ -$(OBJDIR)/%.o : %.cu +$(OBJDIR)/%.o : %.cc @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@ - @$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) @@ -107,7 +107,7 @@ install : lib cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ -FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') +FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. diff --git a/src/bootstrap.cu b/src/bootstrap.cc similarity index 99% rename from src/bootstrap.cu rename to src/bootstrap.cc index 6b1d573..9df38e4 100644 --- a/src/bootstrap.cu +++ b/src/bootstrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/channel.cu b/src/channel.cc similarity index 91% rename from src/channel.cu rename to src/channel.cc index 937e84e..b053e5b 100644 --- a/src/channel.cu +++ b/src/channel.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); } + + // Free the peer structures. + CUDACHECK(cudaFree(channel->devPeers)); + free(channel->peers); + return ncclSuccess; } diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc similarity index 92% rename from src/collectives/all_gather.cu rename to src/collectives/all_gather.cc index db21dee..348c176 100644 --- a/src/collectives/all_gather.cu +++ b/src/collectives/all_gather.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc similarity index 92% rename from src/collectives/all_reduce.cu rename to src/collectives/all_reduce.cc index 1492c90..921f2de 100644 --- a/src/collectives/all_reduce.cu +++ b/src/collectives/all_reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc similarity index 94% rename from src/collectives/broadcast.cu rename to src/collectives/broadcast.cc index 6a3d0a8..042301b 100644 --- a/src/collectives/broadcast.cu +++ b/src/collectives/broadcast.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h index e6b19cb..73fe7d5 100644 --- a/src/collectives/collectives.h +++ b/src/collectives/collectives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile index 8e92596..0ee587b 100644 --- a/src/collectives/device/Makefile +++ b/src/collectives/device/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu index 530bf14..109c341 100644 --- a/src/collectives/device/all_gather.cu +++ b/src/collectives/device/all_gather.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 36809c9..8e78730 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu index aaa96b4..85d007e 100644 --- a/src/collectives/device/all_reduce.cu +++ b/src/collectives/device/all_reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index ea89a71..9b058cc 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; @@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu index b83ee70..8c8dbb6 100644 --- a/src/collectives/device/broadcast.cu +++ b/src/collectives/device/broadcast.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index fb18312..ae8667f 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index e4aecbd..8c336bf 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,7 +8,7 @@ #define NCCL_DEVICE_COMMON_H_ #include "../collectives.h" -#include "core.h" +#include "devcomm.h" #include "nccl.h" // Exit If Abort Barrier across CTA: make sure all threads exit consistently @@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int bid = blockIdx.x; \ __shared__ struct ncclColl localColl; \ \ - struct ncclComm* comm = firstColl.args.comm; \ + struct ncclDevComm* comm = firstColl.args.comm; \ struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index e1fb096..435a598 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,7 +7,7 @@ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ -#include "core.h" +#include "devcomm.h" #include #include diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index ea06b68..010c454 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "collectives.h" #include "common.h" diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh index 3942c8c..4413213 100755 --- a/src/collectives/device/gen_rules.sh +++ b/src/collectives/device/gen_rules.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index c5aaf54..7beeaf4 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -50,7 +50,7 @@ class ncclPrimitives { T* sendDirectBuff[NSEND]; const T* recvBuff[NRECV]; T* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } @@ -239,7 +239,7 @@ class ncclPrimitives { public: __device__ __forceinline__ - ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { // Make sure step is updated before we read it __syncthreads(); @@ -329,14 +329,14 @@ class ncclLLPrimitives { uint64_t sendConnHead; union ncclLLFifoLine* recvBuff[NRECV]; union ncclLLFifoLine* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } - inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } - inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } + inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } + inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } // Exit If Abort Barrier : make sure all threads exit consistently // Each thread sets a predicate to true if val == 1 @@ -393,7 +393,10 @@ class ncclLLPrimitives { sendConnHead = *waitPtr; if (checkAbort(sendConn[i]->opCountRem)) break; } - if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; + if (fifoPtr) { + int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; + fifoPtr[sendStep[i]%NCCL_STEPS] = size; + } } } @@ -402,7 +405,12 @@ class ncclLLPrimitives { if (tid == i) *postPtr = recvStep[i]; } - inline __device__ void postSend(int i) { + inline __device__ void postSend(int i, int offset) { + // LL Cleanup : write all flags in the slice to make sure we don't have + // data corruption when flag loops over. + if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { + for (int o = offset; o sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - /* Reset all flags */ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); - for (int s=0; sllLastCleaning = sendStep[i]; - } - } - - __device__ __forceinline__ void llRecvCleaning(int i) { - if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - recvStep[i] += NCCL_STEPS; - if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; - } - } - public: __device__ __forceinline__ - ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { // Make sure step is updated before we read it. barrier(); @@ -577,8 +563,6 @@ class ncclLLPrimitives { } __device__ __forceinline__ ~ncclLLPrimitives() { - for (int i=0; ibid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu index 10857ed..8b45299 100644 --- a/src/collectives/device/reduce_scatter.cu +++ b/src/collectives/device/reduce_scatter.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index c70c845..09ba56e 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc similarity index 92% rename from src/collectives/reduce.cu rename to src/collectives/reduce.cc index 302d4bc..67f2fae 100644 --- a/src/collectives/reduce.cu +++ b/src/collectives/reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc similarity index 92% rename from src/collectives/reduce_scatter.cu rename to src/collectives/reduce_scatter.cc index 4ee77ef..5ad7f5f 100644 --- a/src/collectives/reduce_scatter.cu +++ b/src/collectives/reduce_scatter.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/enqueue.cu b/src/enqueue.cc similarity index 97% rename from src/enqueue.cu rename to src/enqueue.cc index d283223..b485634 100644 --- a/src/enqueue.cu +++ b/src/enqueue.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par } ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { - params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels); + params->gridDim.x = std::min(params->gridDim.x, comm->nChannels); // Set active = 2 for the last operation for (int r=0; rgridDim.x; r++) { @@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { // Compute thresholds and limits that users can override - int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD); + ssize_t perThreadLLThreshold = std::min(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD); int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); // First compute nThreads @@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); NCCLCHECK(computeColl(info, &coll, &proxyArgs)); - info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads); + info->comm->myParams->blockDim.x = std::max(info->comm->myParams->blockDim.x, coll.args.nThreads); if (info->comm->userStreamSet == false) { info->comm->userStream = info->stream; info->comm->userStreamSet = true; diff --git a/src/include/alloc.h b/src/include/alloc.h new file mode 100644 index 0000000..bcdbd18 --- /dev/null +++ b/src/include/alloc.h @@ -0,0 +1,51 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOC_H_ +#define NCCL_ALLOC_H_ + +#include "nccl.h" +#include "checks.h" +#include + +static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { + CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); + memset(*ptr, 0, size); + *devPtr = *ptr; + return ncclSuccess; +} + +static inline ncclResult_t ncclCudaHostFree(void* ptr) { + CUDACHECK(cudaFreeHost(ptr)); + return ncclSuccess; +} + +template +static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { + void* p = malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memset(p, 0, nelem*sizeof(T)); + *ptr = (T*)p; + return ncclSuccess; +} + +template +static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { + CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); + CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); + return ncclSuccess; +} + +template +static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); + return ncclSuccess; +} + +#endif diff --git a/src/include/argcheck.h b/src/include/argcheck.h new file mode 100644 index 0000000..0d6cca7 --- /dev/null +++ b/src/include/argcheck.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ARGCHECK_H_ +#define NCCL_ARGCHECK_H_ + +#include "core.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); + +#endif diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index a1aaf50..dd7de2c 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/channel.h b/src/include/channel.h index 76c5e8a..c01d942 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/checks.h b/src/include/checks.h index bf7750e..50737b0 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -1,10 +1,73 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#ifndef NCCL_CHECKS_H_ +#define NCCL_CHECKS_H_ -ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); -ncclResult_t ArgsCheck(struct ncclInfo* info); +#include "debug.h" + +// Check CUDA calls +#define CUDACHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUDACHECKGOTO(cmd, res, label) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +#include +// Check system calls +#define SYSCHECK(call, name) do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ +} while (false) + +#define SYSCHECKVAL(call, name, retval) do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (false) + +#define SYSCHECKSYNC(call, name, retval) do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ +} while(true) + +// Propagate errors up +#define NCCLCHECK(call) do { \ + ncclResult_t res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + return res; \ + } \ +} while (0); + +#define NCCLCHECKGOTO(call, res, label) do { \ + res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#endif diff --git a/src/include/comm.h b/src/include/comm.h new file mode 100644 index 0000000..132eb39 --- /dev/null +++ b/src/include/comm.h @@ -0,0 +1,127 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COMM_H_ +#define NCCL_COMM_H_ + +#if CUDART_VERSION < 9000 +struct cudaLaunchParams { + void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +}; +#endif + +#define MAXCHANNELS 16 +#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ + +#define CACHE_LINE_SIZE 128 +#define MEM_ALIGN 4096 +#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ + +struct ncclSendMem { + union { + struct { + uint64_t head; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + void* ptrExchange; + char pad2[CACHE_LINE_SIZE-sizeof(void*)]; + uint64_t opCount; + }; + char pad3[MEM_ALIGN]; + }; +}; + +struct ncclRecvMem { + union { + struct { + uint64_t tail; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + uint64_t opCount; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; + }; + char pad4[MEM_ALIGN]; + }; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; + char buff[1]; // Actually larger than that +}; + +struct ncclComm { + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + + void* bootstrap; + + int rank; // my rank in the communicator + int nRanks; // number of GPUs in communicator + int cudaDev; // my cuda device index + int nvmlDev; // my NVML device number + + enum { GROUP, PARALLEL } launchMode; + cudaStream_t userStream; + bool userStreamSet; + cudaEvent_t doneEvent; + bool checkPointers; + + // Counter to make sure collectives match (needed for bcast/reduce + // where syncs are not symmetric). + uint64_t opCount; + + // Channels for collectives + int nChannels; + int nThreads; + + // Low-latency algorithm threshold + ssize_t llThreshold; + ssize_t threadThreshold; + + // Tree algorithm threshold + ssize_t treeThreshold; + + // An internal CUDA stream for NCCL kernel CGMD launches + int groupCudaStream; + cudaStream_t groupStream; + + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + + // Device side of the communicator + struct ncclDevComm *devComm; + // Host copy of the devComm (to free CUDA allocs) + struct ncclDevComm hostDevComm; + + // Intra-process sync + int intraRank; + int intraRanks; + int* intraBarrier; + int intraPhase; + + // Storage for deferred intra-process launch + struct cudaLaunchParams * intraParams; + struct cudaLaunchParams *myParams; + int* intraCudaDevs; + int* intraCGMode; // Whether we can use CUDA9 CGMD or not + int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not + struct ncclColl args; + void* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; +}; + +#endif diff --git a/src/include/core.h b/src/include/core.h index d57d271..8a08b91 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,385 +7,20 @@ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ -#define NCCL_MAX_OPS 2048 -#define NCCL_STEPS 8 - +#include +#include #include "nccl.h" -#include "transport.h" #include "debug.h" +#include "checks.h" +#include "alloc.h" +#include "transport.h" +#include "devcomm.h" +#include "comm.h" +#include "info.h" +#include "argcheck.h" #include -#include // std::min/std::max #include #include -#include - -#if CUDART_VERSION < 9000 -struct cudaLaunchParams { - void *func; - dim3 gridDim; - dim3 blockDim; - void **args; - size_t sharedMem; - cudaStream_t stream; -}; -#endif - -#define MAXCHANNELS 16 -#define MAXTHREADS 256 -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - -// Channels / LL tuning -#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL -#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS MAXTHREADS -#define NCCL_LL_MIN_NTHREADS 64 - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - -typedef enum { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown -} ncclPattern_t; - -typedef enum { - ncclDevSuccess, - ncclDevAssertedMismatch, - ncclDevSuspectedMismatch -} ncclDevError_t; - -// Used to pass NCCL call information between functions -struct ncclInfo { - ncclColl_t coll; - const char* opName; - // NCCL Coll Args - const void* sendbuff; - void* recvbuff; - size_t count; - ncclDataType_t datatype; - ncclRedOp_t op; - int root; - ncclComm_t comm; - cudaStream_t stream; - // Algorithm details - int chunkSteps; - int sliceSteps; - // Computed later - ncclPattern_t pattern; - size_t nBytes; - int nstepsPerLoop; - int nchunksPerLoop; -}; - -struct ncclConnInfo { - // Regular comm mechanism - char *buff; // Local for recv, remote for send - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - uint64_t *opCountLoc; // opCount of local rank - uint64_t *opCountRem; // opCount of remote rank - - int direct; // Direct communication - void **ptrExchange; // Pointer exchange for direct communication - - int *fifo; // Size fifo for proxy - - uint64_t step; // Keep where we are - - // Low latency mechanism - union ncclLLFifoLine *llBuff; // Local for recv, remote for send - uint64_t llLastCleaning; -}; - -struct ncclConnector { - int connected; - struct ncclProxyArgs *proxyAppend; - struct ncclTransportComm* transportComm; - void* transportResources; // Host-side resources - struct ncclConnInfo conn; - struct ncclComm *comm; -}; - -#define CACHE_LINE_SIZE 128 -#define MEM_ALIGN 4096 -#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ - -#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) -#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) -#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) -#define NCCL_LL_CLEAN_FREQ 0x10000000 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t opCount; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - uint64_t opCount; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[NCCL_STEPS]; - }; - char pad4[MEM_ALIGN]; - }; - ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; - char buff[1]; // Actually larger than that -}; - -struct ncclRing { - // Shortcuts for userRanks[1] and userRanks[n-1] - int prev; - int next; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; -}; - -#define NCCL_MAX_TREE_ARITY 3 -struct ncclTree { - int depth; - int up; - int down[NCCL_MAX_TREE_ARITY]; -}; - -struct ncclPeer { - struct ncclConnector send; - struct ncclConnector recv; -}; - -struct ncclChannel { - union { - struct { - struct ncclRing ring; - struct ncclTree tree; - - int id; - int nthreads; - int buffSize; - - // Communication structures - struct ncclPeer* peers; - struct ncclPeer* devPeers; - - // Operation list for aggregation - struct ncclColl* collectives; - struct ncclColl* devCollectives; - int collStart; - int collCount; - int collFifoHead; // Only used by GPU - int collFifoTail; // Only used by CPU - }; - int data[0x80]; - }; -}; -static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); - -/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclColl. */ -struct CollectiveArgs { - struct ncclComm* comm; - uint64_t opCount; - - // local and remote input, output, and buffer - const void * ThisInput; - void * ThisOutput; - - // general parameters - size_t N; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint16_t nThreads; - - int lastChunkSize; -}; -struct ncclColl { - union { - struct { - struct CollectiveArgs args; - uint16_t funcIndex; - uint16_t nextIndex; - uint8_t active; - }; - int data[0x10]; - }; -}; -static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); - -struct ncclComm { - struct ncclChannel channels[MAXCHANNELS]; - - struct ncclPeerInfo* peerInfo; - - void* bootstrap; - - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int nvmlDev; // my NVML device number - - enum { GROUP, PARALLEL } launchMode; - cudaStream_t userStream; - bool userStreamSet; - cudaEvent_t doneEvent; - bool checkPointers; - - // Counter to make sure collectives match (needed for bcast/reduce - // where syncs are not symmetric). - uint64_t opCount; - - // Channels for collectives - int nChannels; - int nThreads; - - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; - - // Tree algorithm threshold - ssize_t treeThreshold; - - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - cudaStream_t groupStream; - - // Whether there has been a fatal error in this communicator. - ncclResult_t fatalError; - - // Error reported by GPU - volatile ncclDevError_t* fatalDevError; - - // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) - // On device: this pointer has been obtained from cudaHostGetDevicePointer() - volatile uint32_t *abortFlag; - - // Device copy of the communicator - struct ncclComm *devComm; - - // Intra-process sync - int intraRank; - int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - struct cudaLaunchParams * intraParams; - struct cudaLaunchParams *myParams; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclColl args; - void* argsptr; - - // Global proxy thread - pthread_t proxyThread; - struct ncclProxyState proxyState; -}; - -// Check CUDA calls -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, res, label) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -#include -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, res, label) do { \ - res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ -} while (0); #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ @@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { } } -#include -static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { - CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); - memset(*ptr, 0, size); - *devPtr = *ptr; - return ncclSuccess; -} - -static inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return ncclSuccess; -} - -template -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} - -template -static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { - CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); - return ncclSuccess; -} - -template -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); - return ncclSuccess; -} - #endif // end include guard diff --git a/src/include/cpuset.h b/src/include/cpuset.h index f70d1d8..98b93de 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/debug.h b/src/include/debug.h index 3acdf8c..c3e8fa0 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -24,7 +24,7 @@ extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen); +extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); @@ -108,7 +108,7 @@ static inline void initDebug() { break; case 'h': // %h = hostname char hostname[1024]; - getHostName(hostname, 1024); + getHostName(hostname, 1024, '.'); dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid diff --git a/src/include/devcomm.h b/src/include/devcomm.h new file mode 100644 index 0000000..0a2ef96 --- /dev/null +++ b/src/include/devcomm.h @@ -0,0 +1,194 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_H_ +#define NCCL_DEVICE_H_ + +#include "nccl.h" +#include + +#define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 + +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +union ncclLLFifoLine { + /* Flags have to be *after* data, because otherwise, an incomplete receive + from the network may receive the flag but not the data. + Note this is assuming that either we receive contiguous chunks of data + (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ + struct { + uint32_t data1; + uint32_t flag1; + uint32_t data2; + uint32_t flag2; + }; + uint64_t v[2]; + int4 i4; +}; + +#define MAXTHREADS 256 +#define NCCL_LL_MAX_NTHREADS MAXTHREADS +#define NUM_LINES_PER_THREAD 8 +#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) +#ifdef DEBUG_LL +#define NCCL_LL_CLEAN_MASK 0x00000ff8 +#define NCCL_LL_FLAG_MAX 0x00001000 +#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX)) +#else +#define NCCL_LL_CLEAN_MASK 0x7ffffff8 +#define NCCL_LL_FLAG(a) ((uint32_t)(a)) +#endif +// Make sure the clean mask will last for at least NCCL_NSTEPS +static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); + +struct ncclConnInfo { + // Regular comm mechanism + char *buff; // Local for recv, remote for send + uint64_t *tail; // Local for recv, remote for send + uint64_t *head; // Local for send, remote for recv + uint64_t *opCountLoc; // opCount of local rank + uint64_t *opCountRem; // opCount of remote rank + + int direct; // Direct communication + void **ptrExchange; // Pointer exchange for direct communication + + int *fifo; // Size fifo for proxy + + uint64_t step; // Keep where we are + + // Low latency mechanism + union ncclLLFifoLine *llBuff; // Local for recv, remote for send + uint64_t llLastCleaning; +}; + +struct ncclConnector { + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; + void* transportResources; // Host-side resources + struct ncclConnInfo conn; + struct ncclComm *comm; +}; + +struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclDevComm; + +/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ +/* to make sure reads to host from the CUDA kernel are aligned. */ +/* Make sure to adjust padding at the end of ncclColl. */ +struct CollectiveArgs { + struct ncclDevComm* comm; + uint64_t opCount; + + // local and remote input, output, and buffer + const void * ThisInput; + void * ThisOutput; + + // general parameters + size_t N; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint16_t nThreads; + + int lastChunkSize; +}; +struct ncclColl { + union { + struct { + struct CollectiveArgs args; + uint16_t funcIndex; + uint16_t nextIndex; + uint8_t active; + }; + int data[0x10]; + }; +}; +static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); + +struct ncclChannel { + union { + struct { + struct ncclRing ring; + struct ncclTree tree; + + int id; + int nthreads; + int buffSize; + + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; + + // Operation list for aggregation + struct ncclColl* collectives; + struct ncclColl* devCollectives; + int collStart; + int collCount; + int collFifoHead; // Only used by GPU + int collFifoTail; // Only used by CPU + }; + int data[0x80]; + }; +}; +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); + +#define MAXCHANNELS 16 + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +struct ncclDevComm { + int rank; + int nRanks; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + volatile ncclDevError_t *fatalDevError; + + // Channels, device side + struct ncclChannel* channels; +}; + +#endif diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 4db7094..3b7a18c 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,6 +10,12 @@ #include "core.h" #include "group.h" +// Channels / LL tuning +#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings +#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL +#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs +#define NCCL_LL_MIN_NTHREADS 64 + ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 4f3e831..0943f99 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/info.h b/src/include/info.h new file mode 100644 index 0000000..401298a --- /dev/null +++ b/src/include/info.h @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INFO_H_ +#define NCCL_INFO_H_ + +#include "nccl.h" + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclColl_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclPattern_t pattern; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + +#endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 89edbf5..797c759 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -80,12 +80,13 @@ typedef struct { // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); - // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is diff --git a/src/include/net.h b/src/include/net.h index e75e6bb..da3ecea 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/nvlink.h b/src/include/nvlink.h index 1baf9e5..8a0f99e 100644 --- a/src/include/nvlink.h +++ b/src/include/nvlink.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -18,6 +18,7 @@ enum ncclNvLinkDeviceType { ncclNvLinkDeviceGpu, ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) }; static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { @@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); char* rPath = realpath(classPath, NULL); int fd; - SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd); + if ((fd = open(rPath, O_RDONLY)) == -1) { + // Could not find device. It might be because we're in a VM and + // we don't see the whole machine. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); + return ncclSystemError; + } free(rPath); char pciClass[9]; strncpy(pciClass, "0x000000", 9); @@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* if (strcmp(pciClass, "0x068000") == 0) { // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) *type = ncclNvLinkDeviceSwitch; + } else if (strcmp(pciClass, "0x068001") == 0) { + // PCI device is of type "Bridge: IBM Device 04ea" + *type = ncclNvLinkDeviceBridge; } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) *type = ncclNvLinkDeviceGpu; diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 0b6198a..f658279 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/param.h b/src/include/param.h index dd5f697..5431757 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) { s++; strncpy(envValue, line+s, 1024); setenv(envVar, envValue, 0); - char *str = getenv(envVar); } if (line) free(line); fclose(file); diff --git a/src/include/rings.h b/src/include/rings.h index 43fc595..9701f84 100644 --- a/src/include/rings.h +++ b/src/include/rings.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/shm.h b/src/include/shm.h index 4fb49cb..9cd9d05 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/socket.h b/src/include/socket.h index fb5cfc0..739c0c4 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -18,8 +18,9 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // sleep interval in usec -#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec) +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) /* Common socket address storage structure for IPv4/IPv6 */ union socketAddress { @@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { #endif int ret; - int retries = 0; + int timedout_retries = 0; + int refused_retries = 0; retry: SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); if (ret == 0) return ncclSuccess; - if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) { - INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \ - usleep(SLEEP_INT); - goto retry; + if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { + if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } } WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno)); return ncclSystemError; diff --git a/src/include/topo.h b/src/include/topo.h index e824a81..69cd100 100644 --- a/src/include/topo.h +++ b/src/include/topo.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,49 +11,35 @@ #include #include #include +#include -#define BUSID_SIZE (sizeof("0000:00:00.0")) -#define BUSID_REDUCED_SIZE (sizeof("0000:00")) +ncclResult_t getCudaPath(int cudaDev, char** path); -static ncclResult_t getCudaPath(int cudaDev, char** path) { - char busId[BUSID_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); - for (int i=0; i #include "nvmlwrap.h" @@ -37,7 +38,7 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; -enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); @@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) { } } -inline void transportProxyIdle(int idle) { - sched_yield(); -} - #endif diff --git a/src/include/trees.h b/src/include/trees.h index 1a151d1..7eadd85 100644 --- a/src/include/trees.h +++ b/src/include/trees.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/utils.h b/src/include/utils.h index 5a6a588..29b72ad 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,7 +10,7 @@ #include "nccl.h" #include -ncclResult_t getHostName(char* hostname, int maxlen); +ncclResult_t getHostName(char* hostname, int maxlen, const char delim); uint64_t getHostHash(); uint64_t getPidHash(); diff --git a/src/init.cu b/src/init.cc similarity index 93% rename from src/init.cu rename to src/init.cc index 75822e6..80af287 100644 --- a/src/init.cu +++ b/src/init.cc @@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout; std::chrono::high_resolution_clock::time_point ncclEpoch; #endif -#if CUDART_VERSION >= 9200 +#if CUDART_VERSION >= 9020 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream @@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { return bootstrapGetUniqueId(out); } +// Prevent compiler from optimizing out these operations +void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) { + comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1; +} + static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; @@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); + CUDACHECK(cudaFree(comm->hostDevComm.channels)); CUDACHECK(cudaFree(comm->devComm)); for (int channel=0; channelnChannels; channel++) @@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) { CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); + // Poison comm to try and catch a double free + commPoison(comm); + free(comm); return ncclSuccess; } @@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); - comm->rank = rank; - comm->nRanks = ndev; + comm->rank = comm->hostDevComm.rank =rank; + comm->nRanks = comm->hostDevComm.nRanks = ndev; cudaGetDevice(&comm->cudaDev); getNvmlDevice(comm->cudaDev, &comm->nvmlDev); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); comm->doneEvent = doneEvent; comm->llThreshold = ncclParamLlThreshold(); comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; -#if CUDART_VERSION >= 9200 +#if CUDART_VERSION >= 9020 comm->groupCudaStream = ncclParamGroupCudaStream(); #else // Don't allow the user to overload the default setting in older CUDA builds @@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { #endif comm->fatalError = ncclSuccess; - CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped)); + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t))); *comm->fatalDevError = ncclDevSuccess; - CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped)); + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t))); *comm->abortFlag = 0; comm->argsptr = &comm->args; @@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { } static ncclResult_t devCommSetup(ncclComm_t comm) { - // Fully duplicate the comm on the device - NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); - // Copy the comm on the device - NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); - // Copy userRanks + // Duplicate the channels on the device + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels)); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels)); + + // Copy userRanks and peers for (int r=0; rnChannels; r++) { NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); } - // Copy the device-accessible pointer to comm->abortFlag - void *devAbortFlag; - CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0)); - CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice)); - // Copy the device-accessible pointer to comm->fatalDevError - void *devFatalError; - CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0)); - CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice)); + + // Duplicate the dev comm on the device + NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); + NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1)); return ncclSuccess; } @@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, } } - int ranks[nMasters]; + int* ranks; + NCCLCHECK(ncclCalloc(&ranks, nMasters)); int i = 0, masterIndex = -1; // Build binary tree for (int r=0; rup = prev; if (treeMasters[next] == 0) tree->down[0] = next; } + free(ranks); } TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); @@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].recv; if (conn->connected) { ++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } @@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].send; if (conn->connected) { ++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } @@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].send; if (conn->connected) {++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); NCCLCHECK(conn->transportComm->connect(&connect, conn)); conn->connected = 1; @@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].recv; if (conn->connected) {++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); NCCLCHECK(conn->transportComm->connect(&connect, conn)); conn->connected = 1; @@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { return ncclSuccess; } +NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); + static ncclResult_t setCpuAffinity(int cudaDev) { - // Work within the enveloppe we were provided + // Query the CPU affinity set we were provided cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); - // Find the subpart that is local to our GPU +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + // Find the CPUs that are local to the supplied GPU cpu_set_t gpuMask; NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); - cpu_set_t finalMask; - CPU_AND(&finalMask, &mask, &gpuMask); - // If those are not disjoint, try to stay local +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr)); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + cpu_set_t finalMask; + if (ncclParamIgnoreCpuAffinity()) + // Ignore the CPU affinity set and use the GPU one instead + finalMask = gpuMask; + else + // Use a subset of the GPU affinity set + CPU_AND(&finalMask, &mask, &gpuMask); + + // If there is a non empty set, use it to set affinity if (CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); @@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, comms[rank]->threadThreshold = threadThreshold; } + struct ncclConnect* connect; + NCCLCHECK(ncclCalloc(&connect, 2*nranks)); for (int r=0; rtransportComm->connect(connect+ring->next*2+0, send)); } } + free(connect); free(allInfo); free(rings); free(treeIn); @@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { int savedDevice; int rank, cudaDev; ncclComm_t comm = NULL; - int ncclDevList[ndev]; + int* ncclDevList = NULL; + NCCLCHECK(ncclCalloc(&ncclDevList, ndev)); for (int i=0; irank; +#endif CUDACHECK(cudaGetDevice(&savedDevice)); int commDevice = comm->cudaDev; - int rank = comm->rank; if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); @@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) { if (savedDevice != commDevice) CUDACHECK(cudaSetDevice(savedDevice)); - INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); + TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); return ncclSuccess; } @@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev); + + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } + return commDestroy(comm); } diff --git a/src/misc/checks.cu b/src/misc/argcheck.cc similarity index 96% rename from src/misc/checks.cu rename to src/misc/argcheck.cc index a07e577..364f041 100644 --- a/src/misc/checks.cu +++ b/src/misc/argcheck.cc @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "checks.h" +#include "argcheck.h" static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { cudaPointerAttributes attr; diff --git a/src/misc/group.cu b/src/misc/group.cc similarity index 98% rename from src/misc/group.cu rename to src/misc/group.cc index c428a22..7bc64cd 100644 --- a/src/misc/group.cu +++ b/src/misc/group.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() { int savedDev; CUDACHECK(cudaGetDevice(&savedDev)); int done = ncclGroupIndex; - int doneArray[ncclGroupIndex]; + int doneArray[MAX_ASYNC_OPS]; for (int i=0; i= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF static ncclTvalue_t getTvalue(short* distances, int ndev) { ncclTvalue_t tvalue = 0; for (int d=0; dhead < args->end) { if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->hostRecvMem->tail; if (args->llMode) { int buffSlot = args->tail%NCCL_STEPS; int size = sizesFifo[buffSlot]; if (size != -1) { - uint32_t flag = args->tail + 1; + uint32_t flag = NCCL_LL_FLAG(args->tail + 1); int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); size = nFifoLines * sizeof(union ncclLLFifoLine); union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; @@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { } } } - } else if (args->tail < resources->hostRecvMem->tail) { + } else if (args->tail < *recvTail) { struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; int stepSize = args->channel->buffSize/NCCL_STEPS; // Send through network @@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (args->head == args->end) { resources->step = args->end; args->idle = 0; - args->state = ncclProxyOpDone; + args->state = ncclProxyOpNone; } } - if (args->state == ncclProxyOpDone) { - union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff; - if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step; - resources->step += NCCL_STEPS; - resources->hostSendMem->head = resources->step; - resources->llLastCleaning = resources->step; - } - args->state = ncclProxyOpNone; - } return ncclSuccess; } @@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff; void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle; - if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) { + volatile uint64_t* sendHead = &resources->hostSendMem->head; + if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) { int buffSlot = args->tail%NCCL_STEPS; int sliceSize = stepSize * args->sliceSteps; NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot)); @@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { if (args->head == args->end) { resources->step = args->end; args->idle = 0; - args->state = ncclProxyOpDone; + args->state = ncclProxyOpNone; } } - if (args->state == ncclProxyOpDone) { - if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - resources->step += NCCL_STEPS; - while (resources->hostSendMem->head < resources->step); - resources->llLastCleaning = resources->step; - } - args->state = ncclProxyOpNone; - } return ncclSuccess; } diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cc similarity index 99% rename from src/transport/net_ib.cu rename to src/transport/net_ib.cc index f7c574b..de72f89 100644 --- a/src/transport/net_ib.cu +++ b/src/transport/net_ib.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { } int found = 0; struct ibv_device_attr devAttr; + memset(&devAttr, 0, sizeof(devAttr)); if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { WARN("NET/IB : Unable to query device %s", devices[d]->name); if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cc similarity index 99% rename from src/transport/net_socket.cu rename to src/transport/net_socket.cc index 0464b43..9958936 100644 --- a/src/transport/net_socket.cu +++ b/src/transport/net_socket.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/transport/p2p.cu b/src/transport/p2p.cc similarity index 94% rename from src/transport/p2p.cu rename to src/transport/p2p.cc index 9f3e0b6..42b549e 100644 --- a/src/transport/p2p.cu +++ b/src/transport/p2p.cc @@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) { /* Determine if we can communicate with the peer through p2p */ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { // Do not use P2P across root complexes by default (provided CUDA permits it) - int p2pLevel = PATH_SOC; + int p2pLevel = PATH_NODE; if (ncclParamP2pDisable() == 1) p2pLevel = 0; if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); @@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int peerCudaDev = busIdToCudaDev(peerInfo->busId); - if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process + if (peerCudaDev == -1) { + // Peer's CUDA device is not visible in this process +#if CUDART_VERSION >= 10010 + // But in CUDA 10.1 we can still communicate with 'invisible' devices + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId); + // Check for NVLink/NVswitch including P2P access + int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); + if (nvlinkp2p > 0) { + *ret = nvlinkp2p; + return ncclSuccess; + } +#endif + return ncclSuccess; + } TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); // Do not detect topology if we're on the same GPU. Note this is not really supported. if (myInfo->cudaDev == peerCudaDev) { - *ret = 1 + PATH_SOC; + *ret = 1 + PATH_SYS; return ncclSuccess; } @@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc if (err1 == ncclSuccess && err2 == ncclSuccess) { int distance = pciDistance(myPath, peerPath); if (distance < p2pLevel) { - *ret = 1 + PATH_SOC - distance; + *ret = 1 + PATH_SYS - distance; } } if (err1 == ncclSuccess) free(myPath); @@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc return ncclSuccess; } +#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway +#define MAXGPUS_PCI 64 + static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) { int nrings = 0; ncclTvalue_t* line = matrix+current*n; @@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR } } } else { - int ringsSave[nRingsMax*n]; + int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P]; int maxStep = 0; for (int i=0; i 0) { @@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin } static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) { - for (int score = PATH_SOC+1; score >= minScore; score--) { + for (int score = PATH_SYS+1; score >= minScore; score--) { int best = -1; - int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end + int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end for (int n = 0; n < nranks; n++) { if (inRing[n]) continue; if (values[rank*nranks+n] == score) { @@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int start = findConnect(nranks, prev+r*nranks); int end = findConnect(nranks, next+r*nranks); - int inRing[nranks]; + int inRing[MAXGPUS_PCI]; for (int i=0; i 0) { // NVLink : Connect rings or create new ones + if (nranks > MAXGPUS_NVLINKP2P) { + WARN("Recursive P2P computation cannot work for >8 GPUs"); + return ncclInternalError; + } nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads); goto end; } @@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) { if (sendRes->ipcPtr) CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); CUDACHECK(cudaFree(sendRes->devMem)); + free(sendRes); return ncclSuccess; } @@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) { if (recvRes->ipcPtr) CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); CUDACHECK(cudaFree(recvRes->devMem)); + free(recvRes); return ncclSuccess; } diff --git a/src/transport/shm.cu b/src/transport/shm.cc similarity index 98% rename from src/transport/shm.cu rename to src/transport/shm.cc index 83cc9d1..2ec5f23 100644 --- a/src/transport/shm.cu +++ b/src/transport/shm.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) return -1; } +#define MAXGROUPS 16 + ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { if (*nringsRet == MAXCHANNELS) *nringsRet = 1; int nGroups = groups[nranks-1] + 1; - int starts[nGroups]; - int ends[nGroups]; + int starts[MAXGROUPS]; + int ends[MAXGROUPS]; for (int ring = 0; ring<*nringsRet; ring++) { int startGroup = -1, endGroup = -1; for (int group = 0; group