From f40ce73e8987d2990e4b9ef6c75f4b3423acce78 Mon Sep 17 00:00:00 2001
From: David Addison <daddison@nvidia.com>
Date: Thu, 14 Mar 2019 19:39:20 -0700
Subject: [PATCH] NCCL 2.4.6-1

    Added detection of IBM/Power NVLink bridge device.
    Add NUMA support to PCI distance calculations.
    Added NCCL_IGNORE_CPU_AFFINITY env var.
    Fix memory leaks; GithubIssue#180
    Compiler warning fix; GithubIssue#178
    Replace non-standard variable length arrays. GithubIssue#171
    Fix Tree+Shared Memory crash. GithubPR#185
    Fix LL cleanup hang during long running DL jobs.
    Fix NCCL_RINGS environment variable handling.
    Added extra checks to catch repeat calls to ncclCommDestroy() GithubIssue#191
    Improve bootstrap socket connection reliability at scale.
    Fix hostname hashing issue. GithubIssue#187
    Code cleanup to rename all non device files from *.cu to *.cc
---
 LICENSE.txt                                   |   2 +-
 Makefile                                      |   2 +-
 README.md                                     |   2 +-
 ext-net/dummy/Makefile                        |   2 +-
 ext-net/dummy/plugin.c                        |   2 +-
 makefiles/common.mk                           |   8 +-
 makefiles/formatting.mk                       |   2 +-
 makefiles/version.mk                          |   2 +-
 pkg/Makefile                                  |   2 +-
 pkg/debian/Makefile                           |   2 +-
 pkg/redhat/Makefile                           |   2 +-
 pkg/srctxz/Makefile                           |   2 +-
 pkg/srctxz/create_srctxz.sh.in                |   2 +-
 pkg/txz/Makefile                              |   2 +-
 pkg/txz/create_txz.sh.in                      |   2 +-
 src/Makefile                                  |  20 +-
 src/{bootstrap.cu => bootstrap.cc}            |   2 +-
 src/{channel.cu => channel.cc}                |   7 +-
 .../{all_gather.cu => all_gather.cc}          |   2 +-
 .../{all_reduce.cu => all_reduce.cc}          |   2 +-
 .../{broadcast.cu => broadcast.cc}            |   2 +-
 src/collectives/collectives.h                 |   2 +-
 src/collectives/device/Makefile               |   2 +-
 src/collectives/device/all_gather.cu          |   2 +-
 src/collectives/device/all_gather.h           |   8 +-
 src/collectives/device/all_reduce.cu          |   2 +-
 src/collectives/device/all_reduce.h           |  12 +-
 src/collectives/device/broadcast.cu           |   2 +-
 src/collectives/device/broadcast.h            |   8 +-
 src/collectives/device/common.h               |   6 +-
 src/collectives/device/common_kernel.h        |   4 +-
 src/collectives/device/functions.cu           |   4 +-
 src/collectives/device/gen_rules.sh           |   2 +-
 src/collectives/device/primitives.h           |  54 +--
 src/collectives/device/reduce.cu              |   2 +-
 src/collectives/device/reduce.h               |   8 +-
 src/collectives/device/reduce_scatter.cu      |   2 +-
 src/collectives/device/reduce_scatter.h       |   8 +-
 src/collectives/{reduce.cu => reduce.cc}      |   2 +-
 .../{reduce_scatter.cu => reduce_scatter.cc}  |   2 +-
 src/{enqueue.cu => enqueue.cc}                |   8 +-
 src/include/alloc.h                           |  51 +++
 src/include/argcheck.h                        |  15 +
 src/include/bootstrap.h                       |   2 +-
 src/include/channel.h                         |   2 +-
 src/include/checks.h                          |  71 ++-
 src/include/comm.h                            | 127 ++++++
 src/include/core.h                            | 423 +-----------------
 src/include/cpuset.h                          |   2 +-
 src/include/debug.h                           |   6 +-
 src/include/devcomm.h                         | 194 ++++++++
 src/include/enqueue.h                         |   8 +-
 src/include/ibvwrap.h                         |   2 +-
 src/include/info.h                            |  45 ++
 src/include/nccl_net.h                        |   7 +-
 src/include/net.h                             |   2 +-
 src/include/nvlink.h                          |  14 +-
 src/include/nvmlwrap.h                        |   2 +-
 src/include/param.h                           |   3 +-
 src/include/rings.h                           |   2 +-
 src/include/shm.h                             |   2 +-
 src/include/socket.h                          |  21 +-
 src/include/topo.h                            |  60 +--
 src/include/transport.h                       |   7 +-
 src/include/trees.h                           |   2 +-
 src/include/utils.h                           |   4 +-
 src/{init.cu => init.cc}                      | 111 +++--
 src/misc/{checks.cu => argcheck.cc}           |   4 +-
 src/misc/{group.cu => group.cc}               |   4 +-
 src/misc/{ibvwrap.cu => ibvwrap.cc}           |   2 +-
 src/misc/{nvmlwrap.cu => nvmlwrap.cc}         |   2 +-
 src/misc/{rings.cu => rings.cc}               |   6 +-
 src/misc/topo.cc                              |  51 +++
 src/misc/{trees.cu => trees.cc}               |   2 +-
 src/misc/{utils.cu => utils.cc}               |  12 +-
 src/{transport.cu => transport.cc}            |   0
 src/transport/{net.cu => net.cc}              |  52 +--
 src/transport/{net_ib.cu => net_ib.cc}        |   3 +-
 .../{net_socket.cu => net_socket.cc}          |   2 +-
 src/transport/{p2p.cu => p2p.cc}              |  40 +-
 src/transport/{shm.cu => shm.cc}              |   8 +-
 81 files changed, 892 insertions(+), 692 deletions(-)
 rename src/{bootstrap.cu => bootstrap.cc} (99%)
 rename src/{channel.cu => channel.cc} (91%)
 rename src/collectives/{all_gather.cu => all_gather.cc} (92%)
 rename src/collectives/{all_reduce.cu => all_reduce.cc} (92%)
 rename src/collectives/{broadcast.cu => broadcast.cc} (94%)
 rename src/collectives/{reduce.cu => reduce.cc} (92%)
 rename src/collectives/{reduce_scatter.cu => reduce_scatter.cc} (92%)
 rename src/{enqueue.cu => enqueue.cc} (97%)
 create mode 100644 src/include/alloc.h
 create mode 100644 src/include/argcheck.h
 create mode 100644 src/include/comm.h
 create mode 100644 src/include/devcomm.h
 create mode 100644 src/include/info.h
 rename src/{init.cu => init.cc} (93%)
 rename src/misc/{checks.cu => argcheck.cc} (96%)
 rename src/misc/{group.cu => group.cc} (98%)
 rename src/misc/{ibvwrap.cu => ibvwrap.cc} (99%)
 rename src/misc/{nvmlwrap.cu => nvmlwrap.cc} (99%)
 rename src/misc/{rings.cu => rings.cc} (98%)
 create mode 100644 src/misc/topo.cc
 rename src/misc/{trees.cu => trees.cc} (98%)
 rename src/misc/{utils.cu => utils.cc} (94%)
 rename src/{transport.cu => transport.cc} (100%)
 rename src/transport/{net.cu => net.cc} (93%)
 rename src/transport/{net_ib.cu => net_ib.cc} (99%)
 rename src/transport/{net_socket.cu => net_socket.cc} (99%)
 rename src/transport/{p2p.cu => p2p.cc} (94%)
 rename src/transport/{shm.cu => shm.cc} (98%)

diff --git a/LICENSE.txt b/LICENSE.txt
index 3593a7a..e318c66 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
 
- Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
diff --git a/Makefile b/Makefile
index 605e3bf..caed3d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/README.md b/README.md
index fa51453..abfd1cd 100644
--- a/README.md
+++ b/README.md
@@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
 
 ## Copyright
 
-All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
diff --git a/ext-net/dummy/Makefile b/ext-net/dummy/Makefile
index d1eb4c5..efa841c 100644
--- a/ext-net/dummy/Makefile
+++ b/ext-net/dummy/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c
index f11b365..67d7d88 100644
--- a/ext-net/dummy/plugin.c
+++ b/ext-net/dummy/plugin.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/makefiles/common.mk b/makefiles/common.mk
index d0e2ca8..2ad5c73 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -15,6 +15,7 @@ PROFAPI ?= 0
 NVCC = $(CUDA_HOME)/bin/nvcc
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
 CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -43,7 +44,8 @@ endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})
 
 CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS   += -Wall -Wno-sign-compare
+CXXFLAGS   += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS   += -I $(CUDA_INC)
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
@@ -67,7 +69,7 @@ CXXFLAGS  += -O0 -g -ggdb3
 endif
 
 ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
 else
 .SILENT:
diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk
index 4a4ab88..a543131 100644
--- a/makefiles/formatting.mk
+++ b/makefiles/formatting.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/makefiles/version.mk b/makefiles/version.mk
index a8c6e3a..7abaaaf 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 4
-NCCL_PATCH   := 2
+NCCL_PATCH   := 6
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/pkg/Makefile b/pkg/Makefile
index 04b23da..ab6487b 100644
--- a/pkg/Makefile
+++ b/pkg/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
index 439635f..7884cf2 100644
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile
index ffcc973..0808478 100644
--- a/pkg/redhat/Makefile
+++ b/pkg/redhat/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index ed677fe..01cab95 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index ae7d01f..11bdd52 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile
index fa587ef..b7d9aa5 100644
--- a/pkg/txz/Makefile
+++ b/pkg/txz/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in
index 73922e0..deae854 100644
--- a/pkg/txz/create_txz.sh.in
+++ b/pkg/txz/create_txz.sh.in
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/src/Makefile b/src/Makefile
index fe60b11..2d32dca 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
-                misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
-		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
-                collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
+                misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
+		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
 
 ##### lib files
 LIBNAME     := libnccl.so
@@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
 LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
 LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
 LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
 
@@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 
-$(OBJDIR)/%.o : %.cu
+$(OBJDIR)/%.o : %.cc
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@@ -107,7 +107,7 @@ install : lib
 	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
 
-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
 # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
 # as the BUILDDIR variable.
diff --git a/src/bootstrap.cu b/src/bootstrap.cc
similarity index 99%
rename from src/bootstrap.cu
rename to src/bootstrap.cc
index 6b1d573..9df38e4 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/channel.cu b/src/channel.cc
similarity index 91%
rename from src/channel.cu
rename to src/channel.cc
index 937e84e..b053e5b 100644
--- a/src/channel.cu
+++ b/src/channel.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
     if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
     if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
   }
+
+  // Free the peer structures.
+  CUDACHECK(cudaFree(channel->devPeers));
+  free(channel->peers);
+
   return ncclSuccess;
 }
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc
similarity index 92%
rename from src/collectives/all_gather.cu
rename to src/collectives/all_gather.cc
index db21dee..348c176 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc
similarity index 92%
rename from src/collectives/all_reduce.cu
rename to src/collectives/all_reduce.cc
index 1492c90..921f2de 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc
similarity index 94%
rename from src/collectives/broadcast.cu
rename to src/collectives/broadcast.cc
index 6a3d0a8..042301b 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
index e6b19cb..73fe7d5 100644
--- a/src/collectives/collectives.h
+++ b/src/collectives/collectives.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 8e92596..0ee587b 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 530bf14..109c341 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 36809c9..8e78730 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index aaa96b4..85d007e 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index ea89a71..9b058cc 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclTree* tree = &channel->tree;
   const ssize_t size = args->N;
@@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = args->nThreads;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclTree* tree = &channel->tree;
   const ssize_t size = args->N;
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index b83ee70..8c8dbb6 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index fb18312..ae8667f 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index e4aecbd..8c336bf 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,7 +8,7 @@
 #define NCCL_DEVICE_COMMON_H_
 
 #include "../collectives.h"
-#include "core.h"
+#include "devcomm.h"
 #include "nccl.h"
 
 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
@@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
   int bid = blockIdx.x; \
   __shared__ struct ncclColl localColl; \
  \
-  struct ncclComm* comm = firstColl.args.comm; \
+  struct ncclDevComm* comm = firstColl.args.comm; \
   struct ncclChannel* channel = comm->channels+bid; \
   struct ncclColl* c; \
   if (bid == 0) { \
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index e1fb096..435a598 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_
 
-#include "core.h"
+#include "devcomm.h"
 #include <cstdio>
 #include <cstdint>
 
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index ea06b68..010c454 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "collectives.h"
 #include "common.h"
 
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 3942c8c..4413213 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c5aaf54..7beeaf4 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -50,7 +50,7 @@ class ncclPrimitives {
   T* sendDirectBuff[NSEND];
   const T* recvBuff[NRECV];
   T* sendBuff[NSEND];
-  struct ncclComm* comm;
+  struct ncclDevComm* comm;
 
   inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
   inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@@ -239,7 +239,7 @@ class ncclPrimitives {
 
  public:
   __device__ __forceinline__
-  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
     : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
     // Make sure step is updated before we read it
     __syncthreads();
@@ -329,14 +329,14 @@ class ncclLLPrimitives {
   uint64_t sendConnHead;
   union ncclLLFifoLine* recvBuff[NRECV];
   union ncclLLFifoLine* sendBuff[NSEND];
-  struct ncclComm* comm;
+  struct ncclDevComm* comm;
 
   inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
   inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
   inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
   inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
-  inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
-  inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
+  inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
 
   // Exit If Abort Barrier : make sure all threads exit consistently
   // Each thread sets a predicate to true if val == 1
@@ -393,7 +393,10 @@ class ncclLLPrimitives {
         sendConnHead = *waitPtr;
         if (checkAbort(sendConn[i]->opCountRem)) break;
       }
-      if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
+      if (fifoPtr) {
+        int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+        fifoPtr[sendStep[i]%NCCL_STEPS] = size;
+      }
     }
   }
 
@@ -402,7 +405,12 @@ class ncclLLPrimitives {
     if (tid == i) *postPtr = recvStep[i];
   }
 
-  inline __device__ void postSend(int i) {
+  inline __device__ void postSend(int i, int offset) {
+    // LL Cleanup : write all flags in the slice to make sure we don't have
+    // data corruption when flag loops over.
+    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+    }
     sendStep[i]++;
   }
 
@@ -443,9 +451,10 @@ class ncclLLPrimitives {
     uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
     uint64_t* srcPack = (uint64_t*)srcPtr;
     uint64_t* dstPack = (uint64_t*)dstPtr;
+    int offset = tid;
     // Do multiples of 64 bits
     #pragma unroll 2
-    for (int offset=tid; offset<npack; offset+=nthreads) {
+    for (; offset<npack; offset+=nthreads) {
       // Recv : local, then intra-node, then inter-node
       uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
       if (RECV) {
@@ -471,7 +480,7 @@ class ncclLLPrimitives {
     }
     exitIfAbortLocalBarrier();
     FOR_RECV(postRecv);
-    FOR_SEND(postSend);
+    FOR_SEND(postSend, offset);
   }
 
   __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
@@ -514,32 +523,9 @@ class ncclLLPrimitives {
     }
   }
 
-  __device__ __forceinline__ void llSendCleaning(int i) {
-    if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      /* Reset all flags */
-      static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
-      static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
-      for (int s=0; s<NCCL_STEPS; s++) {
-        waitSend(i, 0);
-        for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
-          const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
-          sendPtr(i)[o].i4 = resetLine.i4;
-        }
-      }
-      if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
-    }
-  }
-
-  __device__ __forceinline__ void llRecvCleaning(int i) {
-    if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      recvStep[i] += NCCL_STEPS;
-      if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
-    }
-  }
-
  public:
   __device__ __forceinline__
-  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
     : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
     // Make sure step is updated before we read it.
     barrier();
@@ -577,8 +563,6 @@ class ncclLLPrimitives {
   }
 
   __device__ __forceinline__ ~ncclLLPrimitives() {
-    for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
-    for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
     // Save steps for the next operation
     for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
     for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index 1ef66d4..a2caac5 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 302d053..d2d5d3b 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 10857ed..8b45299 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index c70c845..09ba56e 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc
similarity index 92%
rename from src/collectives/reduce.cu
rename to src/collectives/reduce.cc
index 302d4bc..67f2fae 100644
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc
similarity index 92%
rename from src/collectives/reduce_scatter.cu
rename to src/collectives/reduce_scatter.cc
index 4ee77ef..5ad7f5f 100644
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/enqueue.cu b/src/enqueue.cc
similarity index 97%
rename from src/enqueue.cu
rename to src/enqueue.cc
index d283223..b485634 100644
--- a/src/enqueue.cu
+++ b/src/enqueue.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
 }
 
 ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
-  params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+  params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
 
   // Set active = 2 for the last operation
   for (int r=0; r<params->gridDim.x; r++) {
@@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
 
 static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
   // Compute thresholds and limits that users can override
-  int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
+  ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
   int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
 
   // First compute nThreads
@@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
   memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
   NCCLCHECK(computeColl(info, &coll, &proxyArgs));
 
-  info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
   if (info->comm->userStreamSet == false) {
     info->comm->userStream = info->stream;
     info->comm->userStreamSet = true;
diff --git a/src/include/alloc.h b/src/include/alloc.h
new file mode 100644
index 0000000..bcdbd18
--- /dev/null
+++ b/src/include/alloc.h
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  memset(*ptr, 0, size);
+  *devPtr = *ptr;
+  return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+  CUDACHECK(cudaFreeHost(ptr));
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+  void* p = malloc(nelem*sizeof(T));
+  if (p == NULL) {
+    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    return ncclSystemError;
+  }
+  memset(p, 0, nelem*sizeof(T));
+  *ptr = (T*)p;
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
+  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+  return ncclSuccess;
+}
+
+#endif
diff --git a/src/include/argcheck.h b/src/include/argcheck.h
new file mode 100644
index 0000000..0d6cca7
--- /dev/null
+++ b/src/include/argcheck.h
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index a1aaf50..dd7de2c 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/channel.h b/src/include/channel.h
index 76c5e8a..c01d942 100644
--- a/src/include/channel.h
+++ b/src/include/channel.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/checks.h b/src/include/checks.h
index bf7750e..50737b0 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,10 +1,73 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
 
-ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
-ncclResult_t ArgsCheck(struct ncclInfo* info);
+#include "debug.h"
+
+// Check CUDA calls
+#define CUDACHECK(cmd) do {                                 \
+    cudaError_t e = cmd;                                    \
+    if( e != cudaSuccess ) {                                \
+        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+        return ncclUnhandledCudaError;                      \
+    }                                                       \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do {                 \
+    cudaError_t e = cmd;                                    \
+    if( e != cudaSuccess ) {                                \
+        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+        res = ncclUnhandledCudaError;                       \
+        goto label;                                         \
+    }                                                       \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+#define SYSCHECK(call, name) do { \
+  int retval; \
+  SYSCHECKVAL(call, name, retval); \
+} while (false)
+
+#define SYSCHECKVAL(call, name, retval) do { \
+  SYSCHECKSYNC(call, name, retval); \
+  if (retval == -1) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (false)
+
+#define SYSCHECKSYNC(call, name, retval) do { \
+  retval = call; \
+  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+  } else { \
+    break; \
+  } \
+} while(true)
+
+// Propagate errors up
+#define NCCLCHECK(call) do { \
+  ncclResult_t res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    return res; \
+  } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+  res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#endif
diff --git a/src/include/comm.h b/src/include/comm.h
new file mode 100644
index 0000000..132eb39
--- /dev/null
+++ b/src/include/comm.h
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#if CUDART_VERSION < 9000
+struct cudaLaunchParams {
+  void *func;
+  dim3 gridDim;
+  dim3 blockDim;
+  void **args;
+  size_t sharedMem;
+  cudaStream_t stream;
+};
+#endif
+
+#define MAXCHANNELS 16
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+
+struct ncclSendMem {
+  union {
+    struct {
+      uint64_t head;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      void* ptrExchange;
+      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+      uint64_t opCount;
+    };
+    char pad3[MEM_ALIGN];
+  };
+};
+
+struct ncclRecvMem {
+  union {
+    struct {
+      uint64_t tail;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      uint64_t opCount;
+      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      int sizesFifo[NCCL_STEPS];
+    };
+    char pad4[MEM_ALIGN];
+  };
+  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {
+  struct ncclChannel channels[MAXCHANNELS];
+
+  struct ncclPeerInfo* peerInfo;
+
+  void* bootstrap;
+
+  int rank;    // my rank in the communicator
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; // my cuda device index
+  int nvmlDev; // my NVML device number
+
+  enum { GROUP, PARALLEL } launchMode;
+  cudaStream_t userStream;
+  bool userStreamSet;
+  cudaEvent_t doneEvent;
+  bool checkPointers;
+
+  // Counter to make sure collectives match (needed for bcast/reduce
+  // where syncs are not symmetric).
+  uint64_t opCount;
+
+  // Channels for collectives
+  int nChannels;
+  int nThreads;
+
+  // Low-latency algorithm threshold
+  ssize_t llThreshold;
+  ssize_t threadThreshold;
+
+  // Tree algorithm threshold
+  ssize_t treeThreshold;
+
+  // An internal CUDA stream for NCCL kernel CGMD launches
+  int groupCudaStream;
+  cudaStream_t groupStream;
+
+  // Whether there has been a fatal error in this communicator.
+  ncclResult_t fatalError;
+
+  // Error reported by GPU
+  volatile ncclDevError_t* fatalDevError;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+
+  // Device side of the communicator
+  struct ncclDevComm *devComm;
+  // Host copy of the devComm (to free CUDA allocs)
+  struct ncclDevComm hostDevComm;
+
+  // Intra-process sync
+  int intraRank;
+  int intraRanks;
+  int* intraBarrier;
+  int intraPhase;
+
+  // Storage for deferred intra-process launch
+  struct cudaLaunchParams * intraParams;
+  struct cudaLaunchParams *myParams;
+  int* intraCudaDevs;
+  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+  struct ncclColl args;
+  void* argsptr;
+
+  // Global proxy thread
+  pthread_t proxyThread;
+  struct ncclProxyState proxyState;
+};
+
+#endif
diff --git a/src/include/core.h b/src/include/core.h
index d57d271..8a08b91 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,385 +7,20 @@
 #ifndef NCCL_CORE_H_
 #define NCCL_CORE_H_
 
-#define NCCL_MAX_OPS 2048
-#define NCCL_STEPS 8
-
+#include <pthread.h>
+#include <algorithm>
 #include "nccl.h"
-#include "transport.h"
 #include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "transport.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "info.h"
+#include "argcheck.h"
 #include <cstdio>
-#include <algorithm> // std::min/std::max
 #include <unistd.h>
 #include <stdlib.h>
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
-  void *func;
-  dim3 gridDim;
-  dim3 blockDim;
-  void **args;
-  size_t sharedMem;
-  cudaStream_t stream;
-};
-#endif
-
-#define MAXCHANNELS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64  // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NCCL_LL_MIN_NTHREADS 64
-
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
-  /* Flags have to be *after* data, because otherwise, an incomplete receive
-     from the network may receive the flag but not the data.
-     Note this is assuming that either we receive contiguous chunks of data
-     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
-  struct {
-    uint32_t data1;
-    uint32_t flag1;
-    uint32_t data2;
-    uint32_t flag2;
-  };
-  uint64_t v[2];
-  int4 i4;
-};
-
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-typedef enum {
-  ncclPatternRing,
-  ncclPatternRingTwice,
-  ncclPatternPipelineFrom,
-  ncclPatternPipelineTo,
-  ncclPatternTreeUp,
-  ncclPatternTreeDown,
-  ncclPatternTreeUpDown
-} ncclPattern_t;
-
-typedef enum {
-  ncclDevSuccess,
-  ncclDevAssertedMismatch,
-  ncclDevSuspectedMismatch
-} ncclDevError_t;
-
-// Used to pass NCCL call information between functions
-struct ncclInfo {
-  ncclColl_t coll;
-  const char* opName;
-  // NCCL Coll Args
-  const void* sendbuff;
-  void* recvbuff;
-  size_t count;
-  ncclDataType_t datatype;
-  ncclRedOp_t op;
-  int root;
-  ncclComm_t comm;
-  cudaStream_t stream;
-  // Algorithm details
-  int chunkSteps;
-  int sliceSteps;
-  // Computed later
-  ncclPattern_t pattern;
-  size_t nBytes;
-  int nstepsPerLoop;
-  int nchunksPerLoop;
-};
-
-struct ncclConnInfo {
-  // Regular comm mechanism
-  char *buff;         // Local for recv, remote for send
-  uint64_t *tail;     // Local for recv, remote for send
-  uint64_t *head;     // Local for send, remote for recv
-  uint64_t *opCountLoc; // opCount of local rank
-  uint64_t *opCountRem; // opCount of remote rank
-
-  int direct;         // Direct communication
-  void **ptrExchange; // Pointer exchange for direct communication
-
-  int *fifo;          // Size fifo for proxy
-
-  uint64_t step;      // Keep where we are
-
-  // Low latency mechanism
-  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
-  uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
-  int connected;
-  struct ncclProxyArgs *proxyAppend;
-  struct ncclTransportComm* transportComm;
-  void* transportResources; // Host-side resources
-  struct ncclConnInfo conn;
-  struct ncclComm *comm;
-};
-
-#define CACHE_LINE_SIZE 128
-#define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
-  union {
-    struct {
-      uint64_t head;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      void* ptrExchange;
-      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
-      uint64_t opCount;
-    };
-    char pad3[MEM_ALIGN];
-  };
-};
-
-struct ncclRecvMem {
-  union {
-    struct {
-      uint64_t tail;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      uint64_t opCount;
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      int sizesFifo[NCCL_STEPS];
-    };
-    char pad4[MEM_ALIGN];
-  };
-  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
-  char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
-  // Shortcuts for userRanks[1] and userRanks[n-1]
-  int prev;
-  int next;
-
-  // Maps an internal nccl index to user-specified rank order. This is necessary
-  // since we need to know how the user expects data to be ordered across
-  // devices. Ordered from current device.
-  int* userRanks;
-  int* devUserRanks;
-};
-
-#define NCCL_MAX_TREE_ARITY 3
-struct ncclTree {
-  int depth;
-  int up;
-  int down[NCCL_MAX_TREE_ARITY];
-};
-
-struct ncclPeer {
-  struct ncclConnector send;
-  struct ncclConnector recv;
-};
-
-struct ncclChannel {
-  union {
-    struct {
-      struct ncclRing ring;
-      struct ncclTree tree;
-
-      int id;
-      int nthreads;
-      int buffSize;
-
-      // Communication structures
-      struct ncclPeer* peers;
-      struct ncclPeer* devPeers;
-
-      // Operation list for aggregation
-      struct ncclColl* collectives;
-      struct ncclColl* devCollectives;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
-    };
-    int data[0x80];
-  };
-};
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
-
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclComm* comm;
-  uint64_t opCount;
-
-  // local and remote input, output, and buffer
-  const void * ThisInput;
-  void * ThisOutput;
-
-  // general parameters
-  size_t N;
-  uint32_t root;
-  uint8_t bid;
-  uint8_t nChannels;
-  uint16_t nThreads;
-
-  int lastChunkSize;
-};
-struct ncclColl {
-  union {
-    struct {
-      struct CollectiveArgs args;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
-  };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-
-struct ncclComm {
-  struct ncclChannel channels[MAXCHANNELS];
-
-  struct ncclPeerInfo* peerInfo;
-
-  void* bootstrap;
-
-  int rank;    // my rank in the communicator
-  int nRanks;  // number of GPUs in communicator
-  int cudaDev; // my cuda device index
-  int nvmlDev; // my NVML device number
-
-  enum { GROUP, PARALLEL } launchMode;
-  cudaStream_t userStream;
-  bool userStreamSet;
-  cudaEvent_t doneEvent;
-  bool checkPointers;
-
-  // Counter to make sure collectives match (needed for bcast/reduce
-  // where syncs are not symmetric).
-  uint64_t opCount;
-
-  // Channels for collectives
-  int nChannels;
-  int nThreads;
-
-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
-
-  // Tree algorithm threshold
-  ssize_t treeThreshold;
-
-  // An internal CUDA stream for NCCL kernel CGMD launches
-  int groupCudaStream;
-  cudaStream_t groupStream;
-
-  // Whether there has been a fatal error in this communicator.
-  ncclResult_t fatalError;
-
-  // Error reported by GPU
-  volatile ncclDevError_t* fatalDevError;
-
-  // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
-  // On device:  this pointer has been obtained from cudaHostGetDevicePointer()
-  volatile uint32_t *abortFlag;
-
-  // Device copy of the communicator
-  struct ncclComm *devComm;
-
-  // Intra-process sync
-  int intraRank;
-  int intraRanks;
-  int* intraBarrier;
-  int intraPhase;
-
-  // Storage for deferred intra-process launch
-  struct cudaLaunchParams * intraParams;
-  struct cudaLaunchParams *myParams;
-  int* intraCudaDevs;
-  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
-  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
-  void* argsptr;
-
-  // Global proxy thread
-  pthread_t proxyThread;
-  struct ncclProxyState proxyState;
-};
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do {                                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
-        return ncclUnhandledCudaError;                      \
-    }                                                       \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do {                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
-        res = ncclUnhandledCudaError;                       \
-        goto label;                                         \
-    }                                                       \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
-  int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
-  if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
-    return ncclSystemError; \
-  } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
-  } else { \
-    break; \
-  } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
 
 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...)        \
@@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
   }
 }
 
-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
-  memset(*ptr, 0, size);
-  *devPtr = *ptr;
-  return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(cudaFreeHost(ptr));
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
-  void* p = malloc(nelem*sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
-    return ncclSystemError;
-  }
-  memset(p, 0, nelem*sizeof(T));
-  *ptr = (T*)p;
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
-  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
-  return ncclSuccess;
-}
-
 #endif // end include guard
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index f70d1d8..98b93de 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/debug.h b/src/include/debug.h
index 3acdf8c..c3e8fa0 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -24,7 +24,7 @@ extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen);
+extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
 
 extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
@@ -108,7 +108,7 @@ static inline void initDebug() {
           break;
         case 'h': // %h = hostname
           char hostname[1024];
-          getHostName(hostname, 1024);
+          getHostName(hostname, 1024, '.');
           dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
           break;
         case 'p': // %p = pid
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
new file mode 100644
index 0000000..0a2ef96
--- /dev/null
+++ b/src/include/devcomm.h
@@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_H_
+#define NCCL_DEVICE_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
+
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+    (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+  /* Flags have to be *after* data, because otherwise, an incomplete receive
+     from the network may receive the flag but not the data.
+     Note this is assuming that either we receive contiguous chunks of data
+     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+  struct {
+    uint32_t data1;
+    uint32_t flag1;
+    uint32_t data2;
+    uint32_t flag2;
+  };
+  uint64_t v[2];
+  int4 i4;
+};
+
+#define MAXTHREADS 256
+#define NCCL_LL_MAX_NTHREADS MAXTHREADS
+#define NUM_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
+#ifdef DEBUG_LL
+#define NCCL_LL_CLEAN_MASK 0x00000ff8
+#define NCCL_LL_FLAG_MAX   0x00001000
+#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
+#else
+#define NCCL_LL_CLEAN_MASK 0x7ffffff8
+#define NCCL_LL_FLAG(a) ((uint32_t)(a))
+#endif
+// Make sure the clean mask will last for at least NCCL_NSTEPS
+static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
+
+struct ncclConnInfo {
+  // Regular comm mechanism
+  char *buff;         // Local for recv, remote for send
+  uint64_t *tail;     // Local for recv, remote for send
+  uint64_t *head;     // Local for send, remote for recv
+  uint64_t *opCountLoc; // opCount of local rank
+  uint64_t *opCountRem; // opCount of remote rank
+
+  int direct;         // Direct communication
+  void **ptrExchange; // Pointer exchange for direct communication
+
+  int *fifo;          // Size fifo for proxy
+
+  uint64_t step;      // Keep where we are
+
+  // Low latency mechanism
+  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
+  uint64_t llLastCleaning;
+};
+
+struct ncclConnector {
+  int connected;
+  struct ncclProxyArgs *proxyAppend;
+  struct ncclTransportComm* transportComm;
+  void* transportResources; // Host-side resources
+  struct ncclConnInfo conn;
+  struct ncclComm *comm;
+};
+
+struct ncclRing {
+  // Shortcuts for userRanks[1] and userRanks[n-1]
+  int prev;
+  int next;
+
+  // Maps an internal nccl index to user-specified rank order. This is necessary
+  // since we need to know how the user expects data to be ordered across
+  // devices. Ordered from current device.
+  int* userRanks;
+  int* devUserRanks;
+};
+
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+  int depth;
+  int up;
+  int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+  struct ncclConnector send;
+  struct ncclConnector recv;
+};
+
+struct ncclDevComm;
+
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+  struct ncclDevComm* comm;
+  uint64_t opCount;
+
+  // local and remote input, output, and buffer
+  const void * ThisInput;
+  void * ThisOutput;
+
+  // general parameters
+  size_t N;
+  uint32_t root;
+  uint8_t bid;
+  uint8_t nChannels;
+  uint16_t nThreads;
+
+  int lastChunkSize;
+};
+struct ncclColl {
+  union {
+    struct {
+      struct CollectiveArgs args;
+      uint16_t funcIndex;
+      uint16_t nextIndex;
+      uint8_t  active;
+    };
+    int data[0x10];
+  };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclChannel {
+  union {
+    struct {
+      struct ncclRing ring;
+      struct ncclTree tree;
+
+      int id;
+      int nthreads;
+      int buffSize;
+
+      // Communication structures
+      struct ncclPeer* peers;
+      struct ncclPeer* devPeers;
+
+      // Operation list for aggregation
+      struct ncclColl* collectives;
+      struct ncclColl* devCollectives;
+      int collStart;
+      int collCount;
+      int collFifoHead; // Only used by GPU
+      int collFifoTail; // Only used by CPU
+    };
+    int data[0x80];
+  };
+};
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+
+#define MAXCHANNELS 16
+
+typedef enum {
+  ncclDevSuccess,
+  ncclDevAssertedMismatch,
+  ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+struct ncclDevComm {
+  int rank;
+  int nRanks;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+  volatile ncclDevError_t *fatalDevError;
+
+  // Channels, device side
+  struct ncclChannel* channels;
+};
+
+#endif
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 4db7094..3b7a18c 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,6 +10,12 @@
 #include "core.h"
 #include "group.h"
 
+// Channels / LL tuning
+#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
+#define NCCL_THREAD_THRESHOLD 64  // Per thread size before we switch to non-LL
+#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
+#define NCCL_LL_MIN_NTHREADS 64
+
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
 ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4f3e831..0943f99 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
  *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/info.h b/src/include/info.h
new file mode 100644
index 0000000..401298a
--- /dev/null
+++ b/src/include/info.h
@@ -0,0 +1,45 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+
+typedef enum {
+  ncclPatternRing,
+  ncclPatternRingTwice,
+  ncclPatternPipelineFrom,
+  ncclPatternPipelineTo,
+  ncclPatternTreeUp,
+  ncclPatternTreeDown,
+  ncclPatternTreeUpDown
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+  ncclColl_t coll;
+  const char* opName;
+  // NCCL Coll Args
+  const void* sendbuff;
+  void* recvbuff;
+  size_t count;
+  ncclDataType_t datatype;
+  ncclRedOp_t op;
+  int root;
+  ncclComm_t comm;
+  cudaStream_t stream;
+  // Algorithm details
+  int chunkSteps;
+  int sliceSteps;
+  // Computed later
+  ncclPattern_t pattern;
+  size_t nBytes;
+  int nstepsPerLoop;
+  int nchunksPerLoop;
+};
+
+#endif
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 89edbf5..797c759 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -80,12 +80,13 @@ typedef struct {
   // Finalize connection establishment after remote peer has called connectHandle
   ncclResult_t (*accept)(void* listenComm, void** recvComm);
   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
   ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  // Asynchronous send to a peer.
   // May return request == NULL if the call cannot be performed (or would block)
   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
-  // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  // Asynchronous recv from a peer.
   // May return request == NULL if the call cannot be performed (or would block)
   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
diff --git a/src/include/net.h b/src/include/net.h
index e75e6bb..da3ecea 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
index 1baf9e5..8a0f99e 100644
--- a/src/include/nvlink.h
+++ b/src/include/nvlink.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,6 +18,7 @@
 enum ncclNvLinkDeviceType {
   ncclNvLinkDeviceGpu,
   ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
 };
 
 static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
   memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
   char* rPath = realpath(classPath, NULL);
   int fd;
-  SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
+  if ((fd = open(rPath, O_RDONLY)) == -1) {
+    // Could not find device. It might be because we're in a VM and
+    // we don't see the whole machine. This is handled silently so
+    // we don't want to print an INFO error.
+    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
+    return ncclSystemError;
+  }
   free(rPath);
   char pciClass[9];
   strncpy(pciClass, "0x000000", 9);
@@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
   if (strcmp(pciClass, "0x068000") == 0) {
     // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
     *type = ncclNvLinkDeviceSwitch;
+  } else if (strcmp(pciClass, "0x068001") == 0) {
+    // PCI device is of type "Bridge: IBM Device 04ea"
+    *type = ncclNvLinkDeviceBridge;
   } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
       || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
     *type = ncclNvLinkDeviceGpu;
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 0b6198a..f658279 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/param.h b/src/include/param.h
index dd5f697..5431757 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
     s++;
     strncpy(envValue, line+s, 1024);
     setenv(envVar, envValue, 0);
-    char *str = getenv(envVar);
   }
   if (line) free(line);
   fclose(file);
diff --git a/src/include/rings.h b/src/include/rings.h
index 43fc595..9701f84 100644
--- a/src/include/rings.h
+++ b/src/include/rings.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/shm.h b/src/include/shm.h
index 4fb49cb..9cd9d05 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/socket.h b/src/include/socket.h
index fb5cfc0..739c0c4 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,8 +18,9 @@
 
 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT     1000  // sleep interval in usec
-#define RETRY_TIMES   2e4   // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT            1000 // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
 
 /* Common socket address storage structure for IPv4/IPv6 */
 union socketAddress {
@@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
 #endif
 
   int ret;
-  int retries = 0;
+  int timedout_retries = 0;
+  int refused_retries = 0;
 retry:
   SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
   if (ret == 0) return ncclSuccess;
-  if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
-    INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
-    usleep(SLEEP_INT);
-    goto retry;
+  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+      INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+      usleep(SLEEP_INT);
+      goto retry;
+    }
   }
   WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
   return ncclSystemError;
diff --git a/src/include/topo.h b/src/include/topo.h
index e824a81..69cd100 100644
--- a/src/include/topo.h
+++ b/src/include/topo.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,49 +11,35 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
+#include <stdio.h>
 
-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+ncclResult_t getCudaPath(int cudaDev, char** path);
 
-static ncclResult_t getCudaPath(int cudaDev, char** path) {
-  char busId[BUSID_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-  *path = realpath(busPath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", busPath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
+static int getNumaId(char *path) {
+  char npath[PATH_MAX];
+  snprintf(npath, PATH_MAX, "%s/numa_node", path);
+  npath[PATH_MAX-1] = '\0';
+
+  int numaId = -1;
+  FILE *file = fopen(npath, "r");
+  if (file == NULL) return -1;
+  if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+  fclose(file);
+
+  return numaId;
 }
 
 enum ncclPathDist {
-  PATH_PIX = 0,
-  PATH_PXB = 1,
-  PATH_PHB = 2,
-  PATH_SOC = 3
+  PATH_PIX  = 0,
+  PATH_PXB  = 1,
+  PATH_PHB  = 2,
+  PATH_NODE = 3,
+  PATH_SYS  = 4,
+  PATH_ARRAY_SIZE = 5
 };
 
-static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
+extern const char* pathDists[PATH_ARRAY_SIZE];
 
-static int pciDistance(char* path1, char* path2) {
-  int score = 0;
-  int depth = 0;
-  int same = 1;
-  for (int i=0; i<strlen(path1); i++) {
-    if (path1[i] != path2[i]) same = 0;
-    if (path1[i] == '/') {
-      depth++;
-      if (same == 1) score++;
-    }
-  }
-  if (score <= 3) return PATH_SOC;
-  if (score == 4) return PATH_PHB;
-  if (score == depth-1) return PATH_PIX;
-  return PATH_PXB;
-}
+int pciDistance(char* path1, char* path2);
 
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 6231a71..91628f6 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -8,6 +8,7 @@
 #define NCCL_TRANSPORT_H_
 
 #include "nccl.h"
+#include "devcomm.h"
 #include <stdint.h>
 #include "nvmlwrap.h"
 
@@ -37,7 +38,7 @@ struct ncclConnect {
   char data[CONNECT_SIZE];
 };
 
-enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
 
 struct ncclProxyArgs;
 typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
@@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
   }
 }
 
-inline void transportProxyIdle(int idle) {
-  sched_yield();
-}
-
 #endif
diff --git a/src/include/trees.h b/src/include/trees.h
index 1a151d1..7eadd85 100644
--- a/src/include/trees.h
+++ b/src/include/trees.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/utils.h b/src/include/utils.h
index 5a6a588..29b72ad 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,7 +10,7 @@
 #include "nccl.h"
 #include <stdint.h>
 
-ncclResult_t getHostName(char* hostname, int maxlen);
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 uint64_t getHostHash();
 uint64_t getPidHash();
 
diff --git a/src/init.cu b/src/init.cc
similarity index 93%
rename from src/init.cu
rename to src/init.cc
index 75822e6..80af287 100644
--- a/src/init.cu
+++ b/src/init.cc
@@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout;
 std::chrono::high_resolution_clock::time_point ncclEpoch;
 #endif
 
-#if CUDART_VERSION >= 9200
+#if CUDART_VERSION >= 9020
 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
 #else
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
   return bootstrapGetUniqueId(out);
 }
 
+// Prevent compiler from optimizing out these operations
+void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
+  comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
+}
+
 static ncclResult_t commFree(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
@@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
   if (comm->bootstrap)
     NCCLCHECK(bootstrapClose(comm->bootstrap));
 
+  CUDACHECK(cudaFree(comm->hostDevComm.channels));
   CUDACHECK(cudaFree(comm->devComm));
 
   for (int channel=0; channel<comm->nChannels; channel++)
@@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
   CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
   CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
 
+  // Poison comm to try and catch a double free
+  commPoison(comm);
+
   free(comm);
   return ncclSuccess;
 }
@@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   struct ncclComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
 
-  comm->rank = rank;
-  comm->nRanks = ndev;
+  comm->rank = comm->hostDevComm.rank =rank;
+  comm->nRanks = comm->hostDevComm.nRanks = ndev;
   cudaGetDevice(&comm->cudaDev);
   getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
 
   comm->doneEvent = doneEvent;
   comm->llThreshold = ncclParamLlThreshold();
   comm->treeThreshold = ncclParamTreeThreshold();
   comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9200
+#if CUDART_VERSION >= 9020
   comm->groupCudaStream = ncclParamGroupCudaStream();
 #else
   // Don't allow the user to overload the default setting in older CUDA builds
@@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 #endif
   comm->fatalError = ncclSuccess;
 
-  CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
+  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
   *comm->fatalDevError = ncclDevSuccess;
 
-  CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
+  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
   *comm->abortFlag = 0;
 
   comm->argsptr = &comm->args;
@@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 }
 
 static ncclResult_t devCommSetup(ncclComm_t comm) {
-  // Fully duplicate the comm on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
-  // Copy the comm on the device
-  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
-  // Copy userRanks
+  // Duplicate the channels on the device
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
+
+  // Copy userRanks and peers
   for (int r=0; r<comm->nChannels; r++) {
     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
   }
-  // Copy the device-accessible pointer to comm->abortFlag
-  void *devAbortFlag;
-  CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
-  CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
-  // Copy the device-accessible pointer to comm->fatalDevError
-  void *devFatalError;
-  CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
-  CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
+
+  // Duplicate the dev comm on the device
+  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+  NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
   return ncclSuccess;
 }
 
@@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
       }
     }
 
-    int ranks[nMasters];
+    int* ranks;
+    NCCLCHECK(ncclCalloc(&ranks, nMasters));
     int i = 0, masterIndex = -1;
     // Build binary tree
     for (int r=0; r<nranks; r++) {
@@ -455,6 +461,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
       tree->up = prev;
       if (treeMasters[next] == 0) tree->down[0] = next;
     }
+    free(ranks);
   }
 
   TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
     if (peer == -1) continue;
     conn = &channel->peers[peer].recv;
     if (conn->connected) { ++nSkippedRecv; continue; }
+    memset(&connect, 0, sizeof(connect));
     NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
     NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
   }
@@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
     if (peer == -1) continue;
     conn = &channel->peers[peer].send;
     if (conn->connected) { ++nSkippedSend; continue; }
+    memset(&connect, 0, sizeof(connect));
     NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
     NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
   }
@@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
     if (peer == -1) continue;
     conn = &channel->peers[peer].send;
     if (conn->connected) {++nSkippedSend; continue; }
+    memset(&connect, 0, sizeof(connect));
     NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
     NCCLCHECK(conn->transportComm->connect(&connect, conn));
     conn->connected = 1;
@@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
     if (peer == -1) continue;
     conn = &channel->peers[peer].recv;
     if (conn->connected) {++nSkippedRecv; continue; }
+    memset(&connect, 0, sizeof(connect));
     NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
     NCCLCHECK(conn->transportComm->connect(&connect, conn));
     conn->connected = 1;
@@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
   return ncclSuccess;
 }
 
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
 static ncclResult_t setCpuAffinity(int cudaDev) {
-  // Work within the enveloppe we were provided
+  // Query the CPU affinity set we were provided
   cpu_set_t mask;
   SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
 
-  // Find the subpart that is local to our GPU
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
+  }
+#endif
+
+  // Find the CPUs that are local to the supplied GPU
   cpu_set_t gpuMask;
   NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
-  cpu_set_t finalMask;
-  CPU_AND(&finalMask, &mask, &gpuMask);
 
-  // If those are not disjoint, try to stay local
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
+    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
+  }
+#endif
+
+  cpu_set_t finalMask;
+  if (ncclParamIgnoreCpuAffinity())
+    // Ignore the CPU affinity set and use the GPU one instead
+    finalMask = gpuMask;
+  else
+    // Use a subset of the GPU affinity set
+    CPU_AND(&finalMask, &mask, &gpuMask);
+
+  // If there is a non empty set, use it to set affinity
   if (CPU_COUNT(&finalMask)) {
     char affinityStr[sizeof(cpu_set_t)*2];
     NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
@@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
     comms[rank]->threadThreshold = threadThreshold;
   }
 
+  struct ncclConnect* connect;
+  NCCLCHECK(ncclCalloc(&connect, 2*nranks));
   for (int r=0; r<nrings; r++) {
-    struct ncclConnect connect[2*nranks];
     int* ringRanks = rings+r*nranks;
     for (int rank=0; rank<nranks; rank++) {
       CUDACHECK(cudaSetDevice(devs[rank]));
@@ -1045,6 +1081,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
       NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
     }
   }
+  free(connect);
   free(allInfo);
   free(rings);
   free(treeIn);
@@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   int savedDevice;
   int rank, cudaDev;
   ncclComm_t comm = NULL;
-  int ncclDevList[ndev];
+  int* ncclDevList = NULL;
+  NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
   for (int i=0; i<ndev; i++) {
     ncclDevList[i] = devlist ? devlist[i] : i;
   }
 
-  cudaGetDevice(&savedDevice);
+  CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
 
   for(rank=0; rank<ndev; ++rank)
     comms[rank] = NULL;
@@ -1118,6 +1156,7 @@ cleanup:
   }
 
 final:
+  free(ncclDevList);
   if(wrapNvmlShutdown() != ncclSuccess)
     INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
   cudaSetDevice(savedDevice);
@@ -1128,9 +1167,11 @@ final:
 
 static ncclResult_t commDestroy(ncclComm_t comm) {
   int savedDevice;
+#ifdef ENABLE_TRACE
+  int rank = comm->rank;
+#endif
   CUDACHECK(cudaGetDevice(&savedDevice));
   int commDevice = comm->cudaDev;
-  int rank = comm->rank;
 
   if (savedDevice != commDevice) {
     CUDACHECK(cudaSetDevice(commDevice));
@@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
   if (savedDevice != commDevice)
     CUDACHECK(cudaSetDevice(savedDevice));
 
-  INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
 
   return ncclSuccess;
 }
@@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
+
+  // Try and prevent a double free of the comm struct (user error)
+  if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
+    WARN("comm %p has already been destroyed", comm);
+    return ncclInvalidArgument;
+  }
+
   return commDestroy(comm);
 }
 
diff --git a/src/misc/checks.cu b/src/misc/argcheck.cc
similarity index 96%
rename from src/misc/checks.cu
rename to src/misc/argcheck.cc
index a07e577..364f041 100644
--- a/src/misc/checks.cu
+++ b/src/misc/argcheck.cc
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "checks.h"
+#include "argcheck.h"
 
 static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
   cudaPointerAttributes attr;
diff --git a/src/misc/group.cu b/src/misc/group.cc
similarity index 98%
rename from src/misc/group.cu
rename to src/misc/group.cc
index c428a22..7bc64cd 100644
--- a/src/misc/group.cu
+++ b/src/misc/group.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() {
   int savedDev;
   CUDACHECK(cudaGetDevice(&savedDev));
   int done = ncclGroupIndex;
-  int doneArray[ncclGroupIndex];
+  int doneArray[MAX_ASYNC_OPS];
   for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
 
   ncclResult_t ret = ncclGroupError;
diff --git a/src/misc/ibvwrap.cu b/src/misc/ibvwrap.cc
similarity index 99%
rename from src/misc/ibvwrap.cu
rename to src/misc/ibvwrap.cc
index 7ac3431..f47c141 100644
--- a/src/misc/ibvwrap.cu
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cc
similarity index 99%
rename from src/misc/nvmlwrap.cu
rename to src/misc/nvmlwrap.cc
index 635f332..fbe481f 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/misc/rings.cu b/src/misc/rings.cc
similarity index 98%
rename from src/misc/rings.cu
rename to src/misc/rings.cc
index a7b122c..27ca9b6 100644
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -208,8 +208,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
       NCCLCHECK(getEnvThreads(nthreads));
       for (int r = 0; r<*nrings; r++) {
         for (int i = 0; i<nranks; i++) {
-          if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
-          if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+          if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
+          if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
         }
       }
       return ncclSuccess;
diff --git a/src/misc/topo.cc b/src/misc/topo.cc
new file mode 100644
index 0000000..6364978
--- /dev/null
+++ b/src/misc/topo.cc
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "topo.h"
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+
+ncclResult_t getCudaPath(int cudaDev, char** path) {
+  char busId[BUSID_SIZE];
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+  *path = realpath(busPath, NULL);
+  if (*path == NULL) {
+    WARN("Could not find real path of %s", busPath);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
+
+int pciDistance(char* path1, char* path2) {
+  int score = 0;
+  int depth = 0;
+  int same = 1;
+  for (int i=0; i<strlen(path1); i++) {
+    if (path1[i] != path2[i]) same = 0;
+    if (path1[i] == '/') {
+      depth++;
+      if (same == 1) score++;
+    }
+  }
+  if (score <= 3) {
+    /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
+    int numaId1 = getNumaId(path1);
+    int numaId2 = getNumaId(path2);
+    TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
+    return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
+  }
+  if (score == 4) return PATH_PHB;
+  if (score == depth-1) return PATH_PIX;
+  return PATH_PXB;
+}
diff --git a/src/misc/trees.cu b/src/misc/trees.cc
similarity index 98%
rename from src/misc/trees.cu
rename to src/misc/trees.cc
index e53ea0b..f672abe 100644
--- a/src/misc/trees.cu
+++ b/src/misc/trees.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/misc/utils.cu b/src/misc/utils.cc
similarity index 94%
rename from src/misc/utils.cu
rename to src/misc/utils.cc
index c618e71..5e884ae 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -29,13 +29,13 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
   return ncclSuccess;
 }
 
-ncclResult_t getHostName(char* hostname, int maxlen) {
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
   if (gethostname(hostname, maxlen) != 0) {
     strncpy(hostname, "unknown", maxlen);
     return ncclSystemError;
   }
   int i = 0;
-  while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
   hostname[i] = '\0';
   return ncclSuccess;
 }
@@ -48,7 +48,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   if (ncclDebugLevel <= NCCL_LOG_NONE) return;
 
   char hostname[1024];
-  getHostName(hostname, 1024);
+  getHostName(hostname, 1024, '.');
   int cudaDev;
   cudaGetDevice(&cudaDev);
 
@@ -104,8 +104,8 @@ uint64_t getHash(const char* string) {
  */
 uint64_t getHostHash(void) {
   char uname[1024];
-  // Start off with the hostname
-  (void) getHostName(uname, sizeof(uname));
+  // Start off with the full hostname
+  (void) getHostName(uname, sizeof(uname), '\0');
   int offset = strlen(uname);
   int len;
   // $(readlink /proc/self/ns/uts)
diff --git a/src/transport.cu b/src/transport.cc
similarity index 100%
rename from src/transport.cu
rename to src/transport.cc
diff --git a/src/transport/net.cu b/src/transport/net.cc
similarity index 93%
rename from src/transport/net.cu
rename to src/transport/net.cc
index 06a6e23..823caf1 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cc
@@ -28,7 +28,7 @@ static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF
 static ncclTvalue_t getTvalue(short* distances, int ndev) {
   ncclTvalue_t tvalue = 0;
   for (int d=0; d<ndev; d++) {
-    int score = 1 + PATH_SOC - distances[d];
+    int score = 1 + PATH_SYS - distances[d];
     // Keep 3 bits of score info per dev
     tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
   }
@@ -81,7 +81,7 @@ static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
   ncclResult_t err;
   NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
   err = ncclNetPciPath(dev, &nicPath);
-  *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+  *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
   if (nicPath) free(nicPath);
   if (cudaPath) free(cudaPath);
   return ncclSuccess;
@@ -173,19 +173,19 @@ static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroup
   return -1;
 }
 
-
 ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
   int nGroups = groups[nranks-1] + 1;
-  int cardUsed[NET_MAX_IFS*nGroups];
-  for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
+  int *cardUsed, *starts, *ends;
+  NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
+  NCCLCHECK(ncclCalloc(&starts, nGroups));
+  NCCLCHECK(ncclCalloc(&ends, nGroups));
 
   for (int ring = 0; ring<*nringsRet; ring++) {
-    int starts[nGroups];
-    int ends[nGroups];
     for (int group = 0; group<nGroups; group++) {
       int nranksInGroup = 0;
       int nsubGroups = 0;
-      for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
+      for (int rank=0; rank<nranks; rank++)
+        if (groups[rank] == group) {
           nranksInGroup++;
           nsubGroups = std::max(subgroups[rank], nsubGroups);
         }
@@ -207,7 +207,7 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
       }
       if (starts[group] == -1 || ends[group] == -1) {
         *nringsRet = ring;
-        return ncclSuccess;
+        goto done;
       }
     }
     // Link groups together
@@ -217,6 +217,10 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
       prev[ring*nranks+starts[nextGroup]] = ends[group];
     }
   }
+done:
+  free(cardUsed);
+  free(starts);
+  free(ends);
   return ncclSuccess;
 }
 
@@ -432,11 +436,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
     if (args->head < args->end) {
       if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
         volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
         if (args->llMode) {
           int buffSlot = args->tail%NCCL_STEPS;
           int size = sizesFifo[buffSlot];
           if (size != -1) {
-            uint32_t flag = args->tail + 1;
+            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
             int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
             size = nFifoLines * sizeof(union ncclLLFifoLine);
             union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
@@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
               }
             }
           }
-        } else if (args->tail < resources->hostRecvMem->tail) {
+        } else if (args->tail < *recvTail) {
           struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
           int stepSize = args->channel->buffSize/NCCL_STEPS;
           // Send through network
@@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
     if (args->head == args->end) {
       resources->step = args->end;
       args->idle = 0;
-      args->state = ncclProxyOpDone;
+      args->state = ncclProxyOpNone;
     }
   }
-  if (args->state == ncclProxyOpDone) {
-    union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
-    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
-      resources->step += NCCL_STEPS;
-      resources->hostSendMem->head = resources->step;
-      resources->llLastCleaning = resources->step;
-    }
-    args->state = ncclProxyOpNone;
-  }
   return ncclSuccess;
 }
 
@@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
       struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
       char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
       void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+      volatile uint64_t* sendHead = &resources->hostSendMem->head;
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
         int buffSlot = args->tail%NCCL_STEPS;
         int sliceSize = stepSize * args->sliceSteps;
         NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
     if (args->head == args->end) {
       resources->step = args->end;
       args->idle = 0;
-      args->state = ncclProxyOpDone;
+      args->state = ncclProxyOpNone;
     }
   }
-  if (args->state == ncclProxyOpDone) {
-    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      resources->step += NCCL_STEPS;
-      while (resources->hostSendMem->head < resources->step);
-      resources->llLastCleaning = resources->step;
-    }
-    args->state = ncclProxyOpNone;
-  }
   return ncclSuccess;
 }
 
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cc
similarity index 99%
rename from src/transport/net_ib.cu
rename to src/transport/net_ib.cc
index f7c574b..de72f89 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
         }
         int found = 0;
         struct ibv_device_attr devAttr;
+        memset(&devAttr, 0, sizeof(devAttr));
         if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
           WARN("NET/IB : Unable to query device %s", devices[d]->name);
           if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cc
similarity index 99%
rename from src/transport/net_socket.cu
rename to src/transport/net_socket.cc
index 0464b43..9958936 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cc
similarity index 94%
rename from src/transport/p2p.cu
rename to src/transport/p2p.cc
index 9f3e0b6..42b549e 100644
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cc
@@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) {
 /* Determine if we can communicate with the peer through p2p */
 ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   // Do not use P2P across root complexes by default (provided CUDA permits it)
-  int p2pLevel = PATH_SOC;
+  int p2pLevel = PATH_NODE;
   if (ncclParamP2pDisable() == 1) p2pLevel = 0;
   if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
 
@@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
 
   // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
   int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-  if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+  if (peerCudaDev == -1) {
+    // Peer's CUDA device is not visible in this process
+#if CUDART_VERSION >= 10010
+    // But in CUDA 10.1 we can still communicate with 'invisible' devices
+    TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
+    // Check for NVLink/NVswitch including P2P access
+    int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+    if (nvlinkp2p > 0) {
+      *ret = nvlinkp2p;
+      return ncclSuccess;
+    }
+#endif
+    return ncclSuccess;
+  }
 
   TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
 
   // Do not detect topology if we're on the same GPU. Note this is not really supported.
   if (myInfo->cudaDev == peerCudaDev) {
-    *ret = 1 + PATH_SOC;
+    *ret = 1 + PATH_SYS;
     return ncclSuccess;
   }
 
@@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
   if (err1 == ncclSuccess && err2 == ncclSuccess) {
     int distance = pciDistance(myPath, peerPath);
     if (distance < p2pLevel) {
-      *ret = 1 + PATH_SOC - distance;
+      *ret = 1 + PATH_SYS - distance;
     }
   }
   if (err1 == ncclSuccess) free(myPath);
@@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
   return ncclSuccess;
 }
 
+#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
+#define MAXGPUS_PCI 64
+
 static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
   int nrings = 0;
   ncclTvalue_t* line = matrix+current*n;
@@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
       }
     }
   } else {
-    int ringsSave[nRingsMax*n];
+    int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
     int maxStep = 0;
     for (int i=0; i<n; i++) {
       if (inTheRing[i] == 0 && line[i] > 0) {
@@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
 }
 
 static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
-  for (int score = PATH_SOC+1; score >= minScore; score--) {
+  for (int score = PATH_SYS+1; score >= minScore; score--) {
     int best = -1;
-    int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
+    int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
     for (int n = 0; n < nranks; n++) {
       if (inRing[n]) continue;
       if (values[rank*nranks+n] == score) {
@@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
     int start = findConnect(nranks, prev+r*nranks);
     int end = findConnect(nranks, next+r*nranks);
 
-    int inRing[nranks];
+    int inRing[MAXGPUS_PCI];
     for (int i=0; i<nranks; i++) inRing[i] = 0;
 
     if (start == -1 && end == -1) {
@@ -405,10 +421,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
       links += val/CONNECT_NVLINK;
     }
     if (rank == 0) directLinks = links;
-    else directLinks =  std::min(directLinks, links);
+    else directLinks = std::min(directLinks, links);
   }
   if (directLinks > 0) {
     // NVLink : Connect rings or create new ones
+    if (nranks > MAXGPUS_NVLINKP2P) {
+      WARN("Recursive P2P computation cannot work for >8 GPUs");
+      return ncclInternalError;
+    }
     nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
     goto end;
   }
@@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) {
   if (sendRes->ipcPtr)
     CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
   CUDACHECK(cudaFree(sendRes->devMem));
+  free(sendRes);
   return ncclSuccess;
 }
 
@@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) {
   if (recvRes->ipcPtr)
     CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
   CUDACHECK(cudaFree(recvRes->devMem));
+  free(recvRes);
   return ncclSuccess;
 }
 
diff --git a/src/transport/shm.cu b/src/transport/shm.cc
similarity index 98%
rename from src/transport/shm.cu
rename to src/transport/shm.cc
index 83cc9d1..2ec5f23 100644
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
   return -1;
 }
 
+#define MAXGROUPS 16
+
 ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
   if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
   int nGroups = groups[nranks-1] + 1;
-  int starts[nGroups];
-  int ends[nGroups];
+  int starts[MAXGROUPS];
+  int ends[MAXGROUPS];
   for (int ring = 0; ring<*nringsRet; ring++) {
     int startGroup = -1, endGroup = -1;
     for (int group = 0; group<nGroups; group++) {