Add detection of the IBM/Power NVLink bridge device.
    Add NUMA support to PCI distance calculations.
    Add the NCCL_IGNORE_CPU_AFFINITY env var (usage sketch below).
    Fix memory leaks. GithubIssue#180
    Fix a compiler warning. GithubIssue#178
    Replace non-standard variable-length arrays. GithubIssue#171
    Fix Tree+Shared Memory crash. GithubPR#185
    Fix LL cleanup hang during long-running DL jobs.
    Fix NCCL_RINGS environment variable handling.
    Add extra checks to catch repeat calls to ncclCommDestroy(). GithubIssue#191
    Improve bootstrap socket connection reliability at scale.
    Fix hostname hashing issue. GithubIssue#187
    Code cleanup: rename all non-device files from *.cu to *.cc.
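
A usage sketch for the new env var, reusing the perf invocation from the README hunk below (it accepts 0 or 1; setting it to 1 makes NCCL ignore the job-supplied CPU affinity and use the GPU's affinity only, per the NCCL env var documentation):

$ NCCL_IGNORE_CPU_AFFINITY=1 ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>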
David Addison 2019-03-14 19:39:20 -07:00
Parent 14e0cf644b
Commit f40ce73e89
81 changed files with 892 additions and 692 deletions

View file

@ -1,5 +1,5 @@
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@ -15,6 +15,7 @@ PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@ -43,7 +44,8 @@ endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
CXXFLAGS += -I $(CUDA_INC)
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
else
.SILENT:

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 4
NCCL_PATCH := 2
NCCL_PATCH := 6
NCCL_SUFFIX :=
PKG_REVISION := 1

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@ -9,10 +9,10 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
##### lib files
LIBNAME := libnccl.so
@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : %.cu
$(OBJDIR)/%.o : %.cc
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@ -107,7 +107,7 @@ install : lib
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}
// Free the peer structures.
CUDACHECK(cudaFree(channel->devPeers));
free(channel->peers);
return ncclSuccess;
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -8,7 +8,7 @@
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
#include "core.h"
#include "devcomm.h"
#include "nccl.h"
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
\
struct ncclComm* comm = firstColl.args.comm; \
struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -7,7 +7,7 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "core.h"
#include "devcomm.h"
#include <cstdio>
#include <cstdint>

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "collectives.h"
#include "common.h"

View file

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#

View file

@ -50,7 +50,7 @@ class ncclPrimitives {
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
struct ncclComm* comm;
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@ -239,7 +239,7 @@ class ncclPrimitives {
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
__syncthreads();
@ -329,14 +329,14 @@ class ncclLLPrimitives {
uint64_t sendConnHead;
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclComm* comm;
struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
@ -393,7 +393,10 @@ class ncclLLPrimitives {
sendConnHead = *waitPtr;
if (checkAbort(sendConn[i]->opCountRem)) break;
}
if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
if (fifoPtr) {
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
fifoPtr[sendStep[i]%NCCL_STEPS] = size;
}
}
}
@ -402,7 +405,12 @@ class ncclLLPrimitives {
if (tid == i) *postPtr = recvStep[i];
}
inline __device__ void postSend(int i) {
inline __device__ void postSend(int i, int offset) {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
@ -443,9 +451,10 @@ class ncclLLPrimitives {
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
int offset = tid;
// Do multiples of 64 bits
#pragma unroll 2
for (int offset=tid; offset<npack; offset+=nthreads) {
for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
@ -471,7 +480,7 @@ class ncclLLPrimitives {
}
exitIfAbortLocalBarrier();
FOR_RECV(postRecv);
FOR_SEND(postSend);
FOR_SEND(postSend, offset);
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
@ -514,32 +523,9 @@ class ncclLLPrimitives {
}
}
__device__ __forceinline__ void llSendCleaning(int i) {
if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
/* Reset all flags */
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
for (int s=0; s<NCCL_STEPS; s++) {
waitSend(i, 0);
for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
sendPtr(i)[o].i4 = resetLine.i4;
}
}
if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
}
}
__device__ __forceinline__ void llRecvCleaning(int i) {
if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
recvStep[i] += NCCL_STEPS;
if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
}
}
public:
__device__ __forceinline__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
@ -577,8 +563,6 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ ~ncclLLPrimitives() {
for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
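
The hunks above implement the "Fix LL cleanup hang" item: the periodic llSendCleaning()/llRecvCleaning() passes in the destructor, which spun on waitSend() and could stall long-running jobs, are removed, and cleaning now happens inline. A sketch of the new trigger, using names from this diff:

// Every step whose bits match NCCL_LL_CLEAN_MASK is a cleanup step.
inline bool isLLCleanupStep(uint64_t step) {
  return (step & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK;
}

On such a step, postSend() rewrites all NCCL_LL_SLICE_LINES lines of the slice with the current flag, and waitSend() advertises the full slice size to the proxy so the rewritten flags actually reach the remote side.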

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
}
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
// Set active = 2 for the last operation
for (int r=0; r<params->gridDim.x; r++) {
@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
// Compute thresholds and limits that users can override
int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
// First compute nThreads
@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
if (info->comm->userStreamSet == false) {
info->comm->userStream = info->stream;
info->comm->userStreamSet = true;

src/include/alloc.h (new file, 51 lines)
View file

@ -0,0 +1,51 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ALLOC_H_
#define NCCL_ALLOC_H_
#include "nccl.h"
#include "checks.h"
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
return ncclSuccess;
}
#endif
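
A minimal caller sketch for the helpers above (hypothetical function name; NCCLCHECK comes from checks.h, and peers/devPeers are the ncclChannel fields freed by freeChannel() earlier in this diff):

static ncclResult_t allocChannelPeers(struct ncclChannel* channel, int nRanks) {
  // Host-side peer array, zeroed by ncclCalloc.
  NCCLCHECK(ncclCalloc(&channel->peers, nRanks));
  // Device-side mirror, allocated and zeroed on the GPU.
  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, nRanks));
  return ncclSuccess;
}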

src/include/argcheck.h (new file, 15 lines)
View file

@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ARGCHECK_H_
#define NCCL_ARGCHECK_H_
#include "core.h"
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,10 +1,73 @@
/*************************************************************************
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#ifndef NCCL_CHECKS_H_
#define NCCL_CHECKS_H_
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
ncclResult_t ArgsCheck(struct ncclInfo* info);
#include "debug.h"
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#endif
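
An illustrative composition of these macros (hypothetical function and helper names):

static ncclResult_t setupComm(struct ncclComm* comm) {
  ncclResult_t res = ncclSuccess;
  CUDACHECK(cudaSetDevice(comm->cudaDev));            // cudaError_t mapped to ncclUnhandledCudaError
  NCCLCHECKGOTO(bootstrapSetup(comm), res, cleanup);  // hypothetical helper; logs file:line and jumps on failure
  return ncclSuccess;
cleanup:
  free(comm->peerInfo);  // release partial state before propagating res
  return res;
}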

src/include/comm.h (new file, 127 lines)
View file

@ -0,0 +1,127 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t opCount;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Channels for collectives
int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// Tree algorithm threshold
ssize_t treeThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
// Error reported by GPU
volatile ncclDevError_t* fatalDevError;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
// Device side of the communicator
struct ncclDevComm *devComm;
// Host copy of the devComm (to free CUDA allocs)
struct ncclDevComm hostDevComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
};
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -7,385 +7,20 @@
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
#include <pthread.h>
#include <algorithm>
#include "nccl.h"
#include "transport.h"
#include "debug.h"
#include "checks.h"
#include "alloc.h"
#include "transport.h"
#include "devcomm.h"
#include "comm.h"
#include "info.h"
#include "argcheck.h"
#include <cstdio>
#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXCHANNELS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NCCL_LL_MIN_NTHREADS 64
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
typedef enum {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown
} ncclPattern_t;
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
ncclDevSuspectedMismatch
} ncclDevError_t;
// Used to pass NCCL call information between functions
struct ncclInfo {
ncclColl_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root;
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclPattern_t pattern;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
};
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
uint64_t step; // Keep where we are
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
};
struct ncclConnector {
int connected;
struct ncclProxyArgs *proxyAppend;
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
struct ncclComm *comm;
};
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
uint64_t opCount;
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
};
char pad4[MEM_ALIGN];
};
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
};
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
};
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
int id;
int nthreads;
int buffSize;
// Communication structures
struct ncclPeer* peers;
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
// Channels for collectives
int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
// Tree algorithm threshold
ssize_t treeThreshold;
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
// Error reported by GPU
volatile ncclDevError_t* fatalDevError;
// On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
// On device: this pointer has been obtained from cudaHostGetDevicePointer()
volatile uint32_t *abortFlag;
// Device copy of the communicator
struct ncclComm *devComm;
// Intra-process sync
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
};
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
#define NCCLCHECKGOTO(call, res, label) do { \
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
return ncclSuccess;
}
#endif // end include guard

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -24,7 +24,7 @@ extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
@ -108,7 +108,7 @@ static inline void initDebug() {
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024);
getHostName(hostname, 1024, '.');
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid

src/include/devcomm.h (new file, 194 lines)
View file

@ -0,0 +1,194 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_
#include "nccl.h"
#include <stdint.h>
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
(DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) \
size = ((size + (align) - 1) / (align)) * (align);
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define MAXTHREADS 256
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#ifdef DEBUG_LL
#define NCCL_LL_CLEAN_MASK 0x00000ff8
#define NCCL_LL_FLAG_MAX 0x00001000
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_NSTEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
uint64_t step; // Keep where we are
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
};
struct ncclConnector {
int connected;
struct ncclProxyArgs *proxyAppend;
struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
struct ncclComm *comm;
};
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
};
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
struct ncclPeer {
struct ncclConnector send;
struct ncclConnector recv;
};
struct ncclDevComm;
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
struct ncclDevComm* comm;
uint64_t opCount;
// local and remote input, output, and buffer
const void * ThisInput;
void * ThisOutput;
// general parameters
size_t N;
uint32_t root;
uint8_t bid;
uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
};
struct ncclColl {
union {
struct {
struct CollectiveArgs args;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
};
int data[0x10];
};
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
int id;
int nthreads;
int buffSize;
// Communication structures
struct ncclPeer* peers;
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#define MAXCHANNELS 16
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
ncclDevSuspectedMismatch
} ncclDevError_t;
struct ncclDevComm {
int rank;
int nRanks;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile ncclDevError_t *fatalDevError;
// Channels, device side
struct ncclChannel* channels;
};
#endif
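
A standalone sanity sketch of the flag-wrap arithmetic above, using the DEBUG_LL constants (illustrative only):

#include <cassert>
#include <cstdint>
int main() {
  const uint64_t FLAG_MAX = 0x1000, CLEAN_MASK = 0xff8, STEPS = 8;
  assert(CLEAN_MASK % STEPS == 0);  // the static_assert above, restated
  // Steps 0xff8..0xfff all match the mask, so the last NCCL_STEPS rounds
  // before the flag wraps rewrite every line in the buffer.
  for (uint64_t step = 0xff8; step <= 0xfff; step++)
    assert((step & CLEAN_MASK) == CLEAN_MASK);
  // NCCL_LL_FLAG keeps flag values below FLAG_MAX after the wrap.
  assert(0x1001 % FLAG_MAX == 1);
  return 0;
}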

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -10,6 +10,12 @@
#include "core.h"
#include "group.h"
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MIN_NTHREADS 64
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);

View file

@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

src/include/info.h (new file, 45 lines)
View file

@ -0,0 +1,45 @@
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INFO_H_
#define NCCL_INFO_H_
#include "nccl.h"
typedef enum {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
struct ncclInfo {
ncclColl_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root;
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclPattern_t pattern;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
};
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -80,12 +80,13 @@ typedef struct {
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
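
The "may return request == NULL" contract above means a NULL request is not an error: the transport simply could not post the operation yet, and the caller must retry. A toy standalone model of that contract (none of this is the real plugin API):

#include <stdio.h>
#include <stddef.h>

// Toy transport: isend sets *request = NULL when it cannot post the send
// yet; the caller retries later. Purely illustrative.
typedef struct { int canPost; } toyComm;

static int toyIsend(toyComm* comm, const void* data, int size, void** request) {
  (void)data; (void)size;
  if (!comm->canPost) { *request = NULL; return 0; }  // would block
  *request = comm;  // stand-in for a real request handle
  return 0;
}

int main(void) {
  toyComm comm = { 0 };
  void* req = NULL;
  for (int attempt = 0; req == NULL; attempt++) {
    toyIsend(&comm, "payload", 7, &req);
    if (req == NULL) {
      printf("attempt %d: would block, retrying\n", attempt);
      comm.canPost = 1;  // pretend resources freed up
    }
  }
  printf("send posted\n");
  return 0;
}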

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -18,6 +18,7 @@
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
char* rPath = realpath(classPath, NULL);
int fd;
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
if ((fd = open(rPath, O_RDONLY)) == -1) {
// Could not open the device. This can happen in a VM, where we don't
// see the whole machine, so handle it silently: log at TRACE level
// instead of printing an INFO-level error.
TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
free(rPath);
return ncclSystemError;
}
free(rPath);
char pciClass[9];
strncpy(pciClass, "0x000000", 9);
@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
if (strcmp(pciClass, "0x068000") == 0) {
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
*type = ncclNvLinkDeviceSwitch;
} else if (strcmp(pciClass, "0x068001") == 0) {
// PCI device is of type "Bridge: IBM Device 04ea"
*type = ncclNvLinkDeviceBridge;
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
*type = ncclNvLinkDeviceGpu;
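
The same classification can be reproduced from userspace by reading a device's class file in sysfs. A standalone sketch (the bus ID is a placeholder; the class values are the ones matched above):

#include <stdio.h>
#include <string.h>

int main(int argc, char** argv) {
  const char* busId = argc > 1 ? argv[1] : "0000:00:00.0";  // placeholder
  char path[128];
  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", busId);
  FILE* f = fopen(path, "r");
  if (f == NULL) { perror("fopen"); return 1; }
  char pciClass[16] = {0};
  if (fgets(pciClass, sizeof(pciClass), f) == NULL) { fclose(f); return 1; }
  fclose(f);
  pciClass[strcspn(pciClass, "\n")] = '\0';
  if (strcmp(pciClass, "0x068000") == 0) puts("bridge: NVswitch");
  else if (strcmp(pciClass, "0x068001") == 0) puts("bridge: IBM Device 04ea");
  else if (strcmp(pciClass, "0x030200") == 0 || strcmp(pciClass, "0x030000") == 0) puts("GPU");
  else puts("other");
  return 0;
}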

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // sleep interval in usec
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif
int ret;
int retries = 0;
int timedout_retries = 0;
int refused_retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
usleep(SLEEP_INT);
goto retry;
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
goto retry;
}
}
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;
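
The arithmetic behind the constants: 2e4 refused retries at a 1000 usec sleep preserve the old 20 s budget, while ETIMEDOUT gets only 3 retries because each timed-out attempt can itself block for about 20 s. A standalone sketch of the retry split, with a toy stub in place of a real connect():

#include <stdio.h>
#include <errno.h>

#define RETRY_REFUSED_TIMES 20000  // 20000 * 1000 usec sleep = 20 s budget
#define RETRY_TIMEDOUT_TIMES 3     // each timeout may already take ~20 s

// Toy stand-in for connect(): refuses three times, then succeeds.
static int fakeConnect(int* attempts) {
  if (++(*attempts) < 4) { errno = ECONNREFUSED; return -1; }
  return 0;
}

int main(void) {
  int attempts = 0, refused = 0, timedout = 0;
  for (;;) {
    if (fakeConnect(&attempts) == 0) {
      printf("connected after %d tries\n", attempts);
      return 0;
    }
    if ((errno == ECONNREFUSED && ++refused < RETRY_REFUSED_TIMES) ||
        (errno == ETIMEDOUT && ++timedout < RETRY_TIMEDOUT_TIMES)) {
      continue;  // the real code sleeps SLEEP_INT usec here
    }
    printf("giving up\n");
    return 1;
  }
}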

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -11,49 +11,35 @@
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
ncclResult_t getCudaPath(int cudaDev, char** path);
static ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[BUSID_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
*path = realpath(busPath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", busPath);
return ncclSystemError;
}
return ncclSuccess;
static int getNumaId(char *path) {
char npath[PATH_MAX];
snprintf(npath, PATH_MAX, "%s/numa_node", path);
npath[PATH_MAX-1] = '\0';
int numaId = -1;
FILE *file = fopen(npath, "r");
if (file == NULL) return -1;
if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
fclose(file);
return numaId;
}
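
For illustration, getNumaId() can be exercised directly on any sysfs device directory. A tiny driver, assuming the function above is in scope (the path is a placeholder):

#include <stdio.h>

int main(void) {
  char path[] = "/sys/devices/pci0000:00/0000:00:01.0";  // placeholder device
  printf("numa node: %d\n", getNumaId(path));  // -1 if numa_node is absent
  return 0;
}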
enum ncclPathDist {
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_SOC = 3
PATH_PIX = 0,
PATH_PXB = 1,
PATH_PHB = 2,
PATH_NODE = 3,
PATH_SYS = 4,
PATH_ARRAY_SIZE = 5
};
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
extern const char* pathDists[PATH_ARRAY_SIZE];
static int pciDistance(char* path1, char* path2) {
int score = 0;
int depth = 0;
int same = 1;
for (int i=0; i<strlen(path1); i++) {
if (path1[i] != path2[i]) same = 0;
if (path1[i] == '/') {
depth++;
if (same == 1) score++;
}
}
if (score <= 3) return PATH_SOC;
if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX;
return PATH_PXB;
}
int pciDistance(char* path1, char* path2);
#endif

View file

@ -8,6 +8,7 @@
#define NCCL_TRANSPORT_H_
#include "nccl.h"
#include "devcomm.h"
#include <stdint.h>
#include "nvmlwrap.h"
@ -37,7 +38,7 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
}
}
inline void transportProxyIdle(int idle) {
sched_yield();
}
#endif

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -10,7 +10,7 @@
#include "nccl.h"
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen);
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHostHash();
uint64_t getPidHash();

View file

@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout;
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if CUDART_VERSION >= 9200
#if CUDART_VERSION >= 9020
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
return bootstrapGetUniqueId(out);
}
// Prevent compiler from optimizing out these operations
void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
}
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
CUDACHECK(cudaFree(comm->hostDevComm.channels));
CUDACHECK(cudaFree(comm->devComm));
for (int channel=0; channel<comm->nChannels; channel++)
@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
// Poison comm to try to catch a double free
commPoison(comm);
free(comm);
return ncclSuccess;
}
@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
comm->rank = rank;
comm->nRanks = ndev;
comm->rank = comm->hostDevComm.rank =rank;
comm->nRanks = comm->hostDevComm.nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9200
#if CUDART_VERSION >= 9020
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to overload the default setting in older CUDA builds
@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
#endif
comm->fatalError = ncclSuccess;
CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
*comm->fatalDevError = ncclDevSuccess;
CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
*comm->abortFlag = 0;
comm->argsptr = &comm->args;
@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
}
static ncclResult_t devCommSetup(ncclComm_t comm) {
// Fully duplicate the comm on the device
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
// Copy the comm on the device
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
// Copy userRanks
// Duplicate the channels on the device
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
// Copy userRanks and peers
for (int r=0; r<comm->nChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
}
// Copy the device-accessible pointer to comm->abortFlag
void *devAbortFlag;
CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
// Copy the device-accessible pointer to comm->fatalDevError
void *devFatalError;
CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
// Duplicate the dev comm on the device
NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
return ncclSuccess;
}
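
devCommSetup() now follows a host-shadow pattern: keep a host copy of the device struct (hostDevComm), point its members at device allocations, then ship the whole struct to the GPU in one copy. A standalone CUDA-runtime sketch of that pattern with made-up types:

#include <cuda_runtime.h>
#include <cstdio>

// Host-side shadow of a struct destined for the GPU: fix up device
// pointers on the host copy, then memcpy the struct wholesale.
struct DevThing { int nItems; int* items; };

int main() {
  DevThing host;                             // host shadow (cf. hostDevComm)
  host.nItems = 4;
  cudaMalloc(&host.items, 4 * sizeof(int));  // device pointer stored on host
  DevThing* dev;
  cudaMalloc(&dev, sizeof(DevThing));
  cudaMemcpy(dev, &host, sizeof(DevThing), cudaMemcpyHostToDevice);
  printf("device struct ready\n");
  cudaFree(host.items); cudaFree(dev);
  return 0;
}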
@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
}
}
int ranks[nMasters];
int* ranks;
NCCLCHECK(ncclCalloc(&ranks, nMasters));
int i = 0, masterIndex = -1;
// Build binary tree
for (int r=0; r<nranks; r++) {
@ -455,6 +461,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
tree->up = prev;
if (treeMasters[next] == 0) tree->down[0] = next;
}
free(ranks);
}
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) { ++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].send;
if (conn->connected) { ++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].send;
if (conn->connected) {++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, conn));
conn->connected = 1;
@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) {++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, conn));
conn->connected = 1;
@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
return ncclSuccess;
}
NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
static ncclResult_t setCpuAffinity(int cudaDev) {
// Work within the envelope we were provided
// Query the CPU affinity set we were provided
cpu_set_t mask;
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
// Find the subpart that is local to our GPU
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
}
#endif
// Find the CPUs that are local to the supplied GPU
cpu_set_t gpuMask;
NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
cpu_set_t finalMask;
CPU_AND(&finalMask, &mask, &gpuMask);
// If those are not disjoint, try to stay local
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
}
#endif
cpu_set_t finalMask;
if (ncclParamIgnoreCpuAffinity())
// Ignore the CPU affinity set and use the GPU one instead
finalMask = gpuMask;
else
// Use a subset of the GPU affinity set
CPU_AND(&finalMask, &mask, &gpuMask);
// If there is a non-empty set, use it to set affinity
if (CPU_COUNT(&finalMask)) {
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
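
A toy standalone illustration of the two policies above: by default the GPU-local mask is intersected with the inherited one, while NCCL_IGNORE_CPU_AFFINITY=1 takes the GPU-local mask wholesale (the masks here are made up):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  cpu_set_t inherited, gpuLocal, finalMask;
  CPU_ZERO(&inherited); CPU_SET(0, &inherited); CPU_SET(1, &inherited);
  CPU_ZERO(&gpuLocal);  CPU_SET(1, &gpuLocal);  CPU_SET(2, &gpuLocal);
  const char* ignore = getenv("NCCL_IGNORE_CPU_AFFINITY");
  if (ignore && atoi(ignore) == 1) {
    finalMask = gpuLocal;                        // ignore inherited affinity
  } else {
    CPU_AND(&finalMask, &inherited, &gpuLocal);  // stay within both sets
  }
  printf("final mask has %d cpu(s)\n", CPU_COUNT(&finalMask));  // 2 vs 1
  return 0;
}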
@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
comms[rank]->threadThreshold = threadThreshold;
}
struct ncclConnect* connect;
NCCLCHECK(ncclCalloc(&connect, 2*nranks));
for (int r=0; r<nrings; r++) {
struct ncclConnect connect[2*nranks];
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
@ -1045,6 +1081,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
}
}
free(connect);
free(allInfo);
free(rings);
free(treeIn);
@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
int ncclDevList[ndev];
int* ncclDevList = NULL;
NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
}
cudaGetDevice(&savedDevice);
CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
for(rank=0; rank<ndev; ++rank)
comms[rank] = NULL;
@ -1118,6 +1156,7 @@ cleanup:
}
final:
free(ncclDevList);
if(wrapNvmlShutdown() != ncclSuccess)
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice);
@ -1128,9 +1167,11 @@ final:
static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
#ifdef ENABLE_TRACE
int rank = comm->rank;
#endif
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
int rank = comm->rank;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
return ncclSuccess;
}
@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
// Try to prevent a double free of the comm struct (user error)
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
WARN("comm %p has already been destroyed", comm);
return ncclInvalidArgument;
}
return commDestroy(comm);
}
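
With commPoison() plus this check, an accidental second destroy now fails with ncclInvalidArgument instead of silently corrupting memory. It is best-effort only, since the second call inspects freed storage; hence "try to prevent". Illustrative usage, assuming comm came from a successful init:

ncclCommDestroy(comm);                   // frees and poisons the struct
ncclResult_t r = ncclCommDestroy(comm);  // user error: now caught, warns and
                                         // returns ncclInvalidArgument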

View file

@ -1,10 +1,10 @@
/*************************************************************************
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "checks.h"
#include "argcheck.h"
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() {
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
int done = ncclGroupIndex;
int doneArray[ncclGroupIndex];
int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
ncclResult_t ret = ncclGroupError;

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -208,8 +208,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
NCCLCHECK(getEnvThreads(nthreads));
for (int r = 0; r<*nrings; r++) {
for (int i = 0; i<nranks; i++) {
if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
}
}
return ncclSuccess;

src/misc/topo.cc (new file, +51 lines)
View file

@ -0,0 +1,51 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "topo.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[BUSID_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
*path = realpath(busPath, NULL);
if (*path == NULL) {
WARN("Could not find real path of %s", busPath);
return ncclSystemError;
}
return ncclSuccess;
}
const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
int pciDistance(char* path1, char* path2) {
int score = 0;
int depth = 0;
int same = 1;
for (int i=0; i<strlen(path1); i++) {
if (path1[i] != path2[i]) same = 0;
if (path1[i] == '/') {
depth++;
if (same == 1) score++;
}
}
if (score <= 3) {
/* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
int numaId1 = getNumaId(path1);
int numaId2 = getNumaId(path2);
TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
}
if (score == 4) return PATH_PHB;
if (score == depth-1) return PATH_PIX;
return PATH_PXB;
}
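
Assuming the file above is linked in (together with its NCCL dependencies), pciDistance() can be probed with two sysfs paths; it prints one of the pathDists labels. The paths below are placeholders:

#include <stdio.h>

int main(void) {
  char gpu[] = "/sys/devices/pci0000:00/0000:00:02.0/0000:03:00.0";  // placeholder
  char nic[] = "/sys/devices/pci0000:00/0000:00:02.0/0000:04:00.0";  // placeholder
  printf("distance: %s\n", pathDists[pciDistance(gpu, nic)]);
  return 0;
}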

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -29,13 +29,13 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
return ncclSuccess;
}
ncclResult_t getHostName(char* hostname, int maxlen) {
ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
if (gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
return ncclSystemError;
}
int i = 0;
while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
hostname[i] = '\0';
return ncclSuccess;
}
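
The extra delim parameter controls where the hostname is cut: '.' keeps only the short name for logging, while '\0' keeps the full name, which getHostHash() now starts from. A tiny driver, assuming the function above is in scope:

#include <stdio.h>

int main(void) {
  char shortName[1024], fullName[1024];
  getHostName(shortName, sizeof(shortName), '.');  // "node12.cluster" -> "node12"
  getHostName(fullName, sizeof(fullName), '\0');   // full name kept
  printf("short: %s\nfull:  %s\n", shortName, fullName);
  return 0;
}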
@ -48,7 +48,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
if (ncclDebugLevel <= NCCL_LOG_NONE) return;
char hostname[1024];
getHostName(hostname, 1024);
getHostName(hostname, 1024, '.');
int cudaDev;
cudaGetDevice(&cudaDev);
@ -104,8 +104,8 @@ uint64_t getHash(const char* string) {
*/
uint64_t getHostHash(void) {
char uname[1024];
// Start off with the hostname
(void) getHostName(uname, sizeof(uname));
// Start off with the full hostname
(void) getHostName(uname, sizeof(uname), '\0');
int offset = strlen(uname);
int len;
// $(readlink /proc/self/ns/uts)

View file

View file

@ -28,7 +28,7 @@ static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF
static ncclTvalue_t getTvalue(short* distances, int ndev) {
ncclTvalue_t tvalue = 0;
for (int d=0; d<ndev; d++) {
int score = 1 + PATH_SOC - distances[d];
int score = 1 + PATH_SYS - distances[d];
// Keep 3 bits of score info per dev
tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
}
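
Each network interface contributes a 3-bit score packed into one ncclTvalue_t, as the comment above notes. A standalone round-trip of that packing (the constants mirror the ones referenced above; the typedef width is an assumption for this sketch):

#include <stdio.h>
#include <stdint.h>

#define NET_BITS_PER_IF 3
#define NET_BITS_PER_IF_MASK ((1 << NET_BITS_PER_IF) - 1)
typedef uint64_t ncclTvalue_t;  // assumed width for this sketch

int main(void) {
  int scores[3] = { 5, 2, 7 };  // example per-device scores
  ncclTvalue_t tvalue = 0;
  for (int d = 0; d < 3; d++)
    tvalue |= ((ncclTvalue_t)(scores[d] & NET_BITS_PER_IF_MASK)) << (NET_BITS_PER_IF * d);
  for (int d = 0; d < 3; d++)
    printf("dev %d score %d\n", d,
           (int)((tvalue >> (NET_BITS_PER_IF * d)) & NET_BITS_PER_IF_MASK));
  return 0;
}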
@ -81,7 +81,7 @@ static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
ncclResult_t err;
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
err = ncclNetPciPath(dev, &nicPath);
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
*distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
if (nicPath) free(nicPath);
if (cudaPath) free(cudaPath);
return ncclSuccess;
@ -173,19 +173,19 @@ static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroup
return -1;
}
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
int nGroups = groups[nranks-1] + 1;
int cardUsed[NET_MAX_IFS*nGroups];
for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
int *cardUsed, *starts, *ends;
NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
NCCLCHECK(ncclCalloc(&starts, nGroups));
NCCLCHECK(ncclCalloc(&ends, nGroups));
for (int ring = 0; ring<*nringsRet; ring++) {
int starts[nGroups];
int ends[nGroups];
for (int group = 0; group<nGroups; group++) {
int nranksInGroup = 0;
int nsubGroups = 0;
for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
for (int rank=0; rank<nranks; rank++)
if (groups[rank] == group) {
nranksInGroup++;
nsubGroups = std::max(subgroups[rank], nsubGroups);
}
@ -207,7 +207,7 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
}
if (starts[group] == -1 || ends[group] == -1) {
*nringsRet = ring;
return ncclSuccess;
goto done;
}
}
// Link groups together
@ -217,6 +217,10 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
prev[ring*nranks+starts[nextGroup]] = ends[group];
}
}
done:
free(cardUsed);
free(starts);
free(ends);
return ncclSuccess;
}
@ -432,11 +436,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head < args->end) {
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
if (args->llMode) {
int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
if (size != -1) {
uint32_t flag = args->tail + 1;
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
size = nFifoLines * sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
}
} else if (args->tail < resources->hostRecvMem->tail) {
} else if (args->tail < *recvTail) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
int stepSize = args->channel->buffSize/NCCL_STEPS;
// Send through network
@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpDone;
args->state = ncclProxyOpNone;
}
}
if (args->state == ncclProxyOpDone) {
union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
resources->step += NCCL_STEPS;
resources->hostSendMem->head = resources->step;
resources->llLastCleaning = resources->step;
}
args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
volatile uint64_t* sendHead = &resources->hostSendMem->head;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
args->state = ncclProxyOpDone;
args->state = ncclProxyOpNone;
}
}
if (args->state == ncclProxyOpDone) {
if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
resources->step += NCCL_STEPS;
while (resources->hostSendMem->head < resources->step);
resources->llLastCleaning = resources->step;
}
args->state = ncclProxyOpNone;
}
return ncclSuccess;
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
}
int found = 0;
struct ibv_device_attr devAttr;
memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
WARN("NET/IB : Unable to query device %s", devices[d]->name);
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

View file

@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) {
/* Determine if we can communicate with the peer through p2p */
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
// Do not use P2P across root complexes by default (provided CUDA permits it)
int p2pLevel = PATH_SOC;
int p2pLevel = PATH_NODE;
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
if (peerCudaDev == -1) {
// Peer's CUDA device is not visible in this process
#if CUDART_VERSION >= 10010
// But in CUDA 10.1 we can still communicate with 'invisible' devices
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
// Check for NVLink/NVswitch including P2P access
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
if (nvlinkp2p > 0) {
*ret = nvlinkp2p;
return ncclSuccess;
}
#endif
return ncclSuccess;
}
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
// Do not detect topology if we're on the same GPU. Note this is not really supported.
if (myInfo->cudaDev == peerCudaDev) {
*ret = 1 + PATH_SOC;
*ret = 1 + PATH_SYS;
return ncclSuccess;
}
@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
if (err1 == ncclSuccess && err2 == ncclSuccess) {
int distance = pciDistance(myPath, peerPath);
if (distance < p2pLevel) {
*ret = 1 + PATH_SOC - distance;
*ret = 1 + PATH_SYS - distance;
}
}
if (err1 == ncclSuccess) free(myPath);
@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
return ncclSuccess;
}
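
The return value inverts the distance so that closer paths score higher: PATH_PIX maps to the top score and PATH_SYS to the bottom one. A one-line check of that mapping (PATH_* values as in topo.h above):

#include <stdio.h>

enum { PATH_PIX = 0, PATH_PXB = 1, PATH_PHB = 2, PATH_NODE = 3, PATH_SYS = 4 };

int main(void) {
  for (int d = PATH_PIX; d <= PATH_SYS; d++)
    printf("distance %d -> score %d\n", d, 1 + PATH_SYS - d);  // 5 down to 1
  return 0;
}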
#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
#define MAXGPUS_PCI 64
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
int nrings = 0;
ncclTvalue_t* line = matrix+current*n;
@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
}
}
} else {
int ringsSave[nRingsMax*n];
int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
int maxStep = 0;
for (int i=0; i<n; i++) {
if (inTheRing[i] == 0 && line[i] > 0) {
@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
}
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
for (int score = PATH_SOC+1; score >= minScore; score--) {
for (int score = PATH_SYS+1; score >= minScore; score--) {
int best = -1;
int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
for (int n = 0; n < nranks; n++) {
if (inRing[n]) continue;
if (values[rank*nranks+n] == score) {
@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
int start = findConnect(nranks, prev+r*nranks);
int end = findConnect(nranks, next+r*nranks);
int inRing[nranks];
int inRing[MAXGPUS_PCI];
for (int i=0; i<nranks; i++) inRing[i] = 0;
if (start == -1 && end == -1) {
@ -405,10 +421,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
links += val/CONNECT_NVLINK;
}
if (rank == 0) directLinks = links;
else directLinks = std::min(directLinks, links);
else directLinks = std::min(directLinks, links);
}
if (directLinks > 0) {
// NVLink : Connect rings or create new ones
if (nranks > MAXGPUS_NVLINKP2P) {
WARN("Recursive P2P computation cannot work for >8 GPUs");
return ncclInternalError;
}
nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
goto end;
}
@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) {
if (sendRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
CUDACHECK(cudaFree(sendRes->devMem));
free(sendRes);
return ncclSuccess;
}
@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) {
if (recvRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
CUDACHECK(cudaFree(recvRes->devMem));
free(recvRes);
return ncclSuccess;
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
return -1;
}
#define MAXGROUPS 16
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
int nGroups = groups[nranks-1] + 1;
int starts[nGroups];
int ends[nGroups];
int starts[MAXGROUPS];
int ends[MAXGROUPS];
for (int ring = 0; ring<*nringsRet; ring++) {
int startGroup = -1, endGroup = -1;
for (int group = 0; group<nGroups; group++) {