Mirror of https://github.com/microsoft/msccl.git
NCCL 2.4.6-1
Added detection of IBM/Power NVLink bridge device.
Add NUMA support to PCI distance calculations.
Added NCCL_IGNORE_CPU_AFFINITY env var.
Fix memory leaks. GithubIssue#180
Compiler warning fix. GithubIssue#178
Replace non-standard variable length arrays. GithubIssue#171
Fix Tree+Shared Memory crash. GithubPR#185
Fix LL cleanup hang during long running DL jobs.
Fix NCCL_RINGS environment variable handling.
Added extra checks to catch repeat calls to ncclCommDestroy(). GithubIssue#191
Improve bootstrap socket connection reliability at scale.
Fix hostname hashing issue. GithubIssue#187
Code cleanup to rename all non device files from *.cu to *.cc.
Parent: 14e0cf644b
Commit: f40ce73e89
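For illustration only (not part of this commit): the new NCCL_IGNORE_CPU_AFFINITY variable is read from the environment, so it can be set by the job launcher or programmatically before the communicator is created. A minimal sketch using the standard NCCL API; the wrapper function name is hypothetical:

    #include <cstdlib>
    #include <nccl.h>

    // Keep the CPU affinity chosen by the job scheduler instead of letting
    // NCCL rebind threads; must be set before communicator initialization.
    ncclResult_t initKeepingSchedulerAffinity(ncclComm_t* comm, int nRanks,
                                              ncclUniqueId id, int rank) {
      setenv("NCCL_IGNORE_CPU_AFFINITY", "1", 1);  // hypothetical usage sketch
      return ncclCommInitRank(comm, nRanks, id, rank);
    }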
@@ -1,5 +1,5 @@
-Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
Makefile | 2 +-
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
 
 ## Copyright
 
-All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -15,6 +15,7 @@ PROFAPI ?= 0
 NVCC = $(CUDA_HOME)/bin/nvcc
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -43,7 +44,8 @@ endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})
 
 CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS += -Wall -Wno-sign-compare
+CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS += -I $(CUDA_INC)
 NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
 endif
 
 ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS += -Wall -Wextra
 else
 .SILENT:
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
 NCCL_MINOR := 4
-NCCL_PATCH := 2
+NCCL_PATCH := 6
 NCCL_SUFFIX :=
 PKG_REVISION := 1
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
src/Makefile | 20 ++++++++++----------
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
-		misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
-		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
-		collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
+		misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
+		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+		collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
 
 ##### lib files
 LIBNAME := libnccl.so
@@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
 LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
 LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES := $(LIBOBJ:%.o=%.d)
 LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
 
@@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 
-$(OBJDIR)/%.o : %.cu
+$(OBJDIR)/%.o : %.cc
 	@printf "Compiling %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
 		sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@@ -107,7 +107,7 @@ install : lib
 	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
 
-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
 # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
 # as the BUILDDIR variable.
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
     if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
     if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
   }
+
+  // Free the peer structures.
+  CUDACHECK(cudaFree(channel->devPeers));
+  free(channel->peers);
+
   return ncclSuccess;
 }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclTree* tree = &channel->tree;
   const ssize_t size = args->N;
@@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = args->nThreads;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclTree* tree = &channel->tree;
   const ssize_t size = args->N;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,7 +8,7 @@
 #define NCCL_DEVICE_COMMON_H_
 
 #include "../collectives.h"
-#include "core.h"
+#include "devcomm.h"
 #include "nccl.h"
 
 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
@@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
   int bid = blockIdx.x; \
   __shared__ struct ncclColl localColl; \
 \
-  struct ncclComm* comm = firstColl.args.comm; \
+  struct ncclDevComm* comm = firstColl.args.comm; \
   struct ncclChannel* channel = comm->channels+bid; \
   struct ncclColl* c; \
   if (bid == 0) { \
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_
 
-#include "core.h"
+#include "devcomm.h"
 #include <cstdio>
 #include <cstdint>
 
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "collectives.h"
 #include "common.h"
 
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -50,7 +50,7 @@ class ncclPrimitives {
   T* sendDirectBuff[NSEND];
   const T* recvBuff[NRECV];
   T* sendBuff[NSEND];
-  struct ncclComm* comm;
+  struct ncclDevComm* comm;
 
   inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
   inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@@ -239,7 +239,7 @@ class ncclPrimitives {
 
  public:
   __device__ __forceinline__
-  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
     : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
     // Make sure step is updated before we read it
     __syncthreads();
@@ -329,14 +329,14 @@ class ncclLLPrimitives {
   uint64_t sendConnHead;
   union ncclLLFifoLine* recvBuff[NRECV];
   union ncclLLFifoLine* sendBuff[NSEND];
-  struct ncclComm* comm;
+  struct ncclDevComm* comm;
 
   inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
   inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
   inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
   inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
-  inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
-  inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
+  inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
 
   // Exit If Abort Barrier : make sure all threads exit consistently
   // Each thread sets a predicate to true if val == 1
@@ -393,7 +393,10 @@ class ncclLLPrimitives {
       sendConnHead = *waitPtr;
       if (checkAbort(sendConn[i]->opCountRem)) break;
     }
-    if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
+    if (fifoPtr) {
+      int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+      fifoPtr[sendStep[i]%NCCL_STEPS] = size;
+    }
   }
 }
 
@@ -402,7 +405,12 @@ class ncclLLPrimitives {
     if (tid == i) *postPtr = recvStep[i];
   }
 
-  inline __device__ void postSend(int i) {
+  inline __device__ void postSend(int i, int offset) {
+    // LL Cleanup : write all flags in the slice to make sure we don't have
+    // data corruption when flag loops over.
+    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+    }
     sendStep[i]++;
   }
 
@@ -443,9 +451,10 @@ class ncclLLPrimitives {
     uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
    uint64_t* srcPack = (uint64_t*)srcPtr;
     uint64_t* dstPack = (uint64_t*)dstPtr;
+    int offset = tid;
     // Do multiples of 64 bits
     #pragma unroll 2
-    for (int offset=tid; offset<npack; offset+=nthreads) {
+    for (; offset<npack; offset+=nthreads) {
       // Recv : local, then intra-node, then inter-node
       uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
       if (RECV) {
@@ -471,7 +480,7 @@ class ncclLLPrimitives {
     }
     exitIfAbortLocalBarrier();
     FOR_RECV(postRecv);
-    FOR_SEND(postSend);
+    FOR_SEND(postSend, offset);
   }
 
   __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
@@ -514,32 +523,9 @@ class ncclLLPrimitives {
     }
   }
 
-  __device__ __forceinline__ void llSendCleaning(int i) {
-    if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      /* Reset all flags */
-      static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
-      static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
-      for (int s=0; s<NCCL_STEPS; s++) {
-        waitSend(i, 0);
-        for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
-          const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
-          sendPtr(i)[o].i4 = resetLine.i4;
-        }
-      }
-      if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
-    }
-  }
-
-  __device__ __forceinline__ void llRecvCleaning(int i) {
-    if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      recvStep[i] += NCCL_STEPS;
-      if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
-    }
-  }
-
  public:
   __device__ __forceinline__
-  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
     : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
     // Make sure step is updated before we read it.
     barrier();
@@ -577,8 +563,6 @@ class ncclLLPrimitives {
   }
 
   __device__ __forceinline__ ~ncclLLPrimitives() {
-    for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
-    for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
     // Save steps for the next operation
     for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
     for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
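A note on the LL (low-latency) changes above, with a sketch (the constants below are simplified assumptions, not NCCL's actual values). LL sends tag every 8-byte line with a step-derived flag the receiver polls on. With this release the flag wraps via NCCL_LL_FLAG, so each slice must be rewritten with fresh flags once per wrap period; otherwise a stale line from the previous wrap could already carry the "expected" value. Doing that rewrite incrementally inside postSend, rather than draining the whole buffer in the destructor as llSendCleaning did, is what addresses the "LL cleanup hang during long running DL jobs" item in the release notes:

    #include <cstdint>
    #include <cstdio>

    constexpr uint64_t kFlagMax = 1 << 20;  // illustrative wrap period only

    static uint32_t llFlag(uint64_t step) { return (uint32_t)(step % kFlagMax); }

    int main() {
      uint64_t oldStep = 5, newStep = 5 + kFlagMax;  // one full wrap later
      // Identical flags: without periodic cleaning, a stale line would be
      // indistinguishable from freshly written data.
      std::printf("flag=%u flag=%u\n", llFlag(oldStep), llFlag(newStep));
    }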
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,10 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
@@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
 }
 
 ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
-  params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+  params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
 
   // Set active = 2 for the last operation
   for (int r=0; r<params->gridDim.x; r++) {
@@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
 
 static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
   // Compute thresholds and limits that users can override
-  int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
+  ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
   int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
 
   // First compute nThreads
@@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
   memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
   NCCLCHECK(computeColl(info, &coll, &proxyArgs));
 
-  info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
   if (info->comm->userStreamSet == false) {
     info->comm->userStream = info->stream;
     info->comm->userStreamSet = true;
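The three std::min/std::max changes above share one apparent motive: mixed signed/unsigned comparisons trigger -Wsign-compare warnings (see "Compiler warning fix" in the release notes), and an explicit template argument forces both operands to a single type. A standalone sketch with hypothetical names:

    #include <algorithm>

    unsigned pickGridDim(unsigned requested, int nChannels) {
      // std::min((int)requested, nChannels) needs a cast that can overflow;
      // std::min<unsigned>(requested, nChannels) converts nChannels instead.
      return std::min<unsigned>(requested, nChannels);
    }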
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  memset(*ptr, 0, size);
+  *devPtr = *ptr;
+  return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+  CUDACHECK(cudaFreeHost(ptr));
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+  void* p = malloc(nelem*sizeof(T));
+  if (p == NULL) {
+    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    return ncclSystemError;
+  }
+  memset(p, 0, nelem*sizeof(T));
+  *ptr = (T*)p;
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
+  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+  return ncclSuccess;
+}
+
+#endif
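A usage sketch for the new alloc.h helpers (the caller and its arguments are hypothetical): each helper returns ncclResult_t, so call sites can wrap them in NCCLCHECK and receive zero-initialized, correctly typed host or device memory:

    #include "alloc.h"  // assumed include path within the NCCL tree

    static ncclResult_t allocPeerArrays(struct ncclPeer** hostPeers,
                                        struct ncclPeer** devPeers, int nRanks) {
      NCCLCHECK(ncclCalloc(hostPeers, nRanks));     // zeroed host allocation
      NCCLCHECK(ncclCudaCalloc(devPeers, nRanks));  // zeroed device allocation
      return ncclSuccess;
    }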
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -1,10 +1,73 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
 
-ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
-ncclResult_t ArgsCheck(struct ncclInfo* info);
+#include "debug.h"
+
+// Check CUDA calls
+#define CUDACHECK(cmd) do { \
+    cudaError_t e = cmd; \
+    if( e != cudaSuccess ) { \
+      WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+      return ncclUnhandledCudaError; \
+    } \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do { \
+    cudaError_t e = cmd; \
+    if( e != cudaSuccess ) { \
+      WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+      res = ncclUnhandledCudaError; \
+      goto label; \
+    } \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+#define SYSCHECK(call, name) do { \
+  int retval; \
+  SYSCHECKVAL(call, name, retval); \
+} while (false)
+
+#define SYSCHECKVAL(call, name, retval) do { \
+  SYSCHECKSYNC(call, name, retval); \
+  if (retval == -1) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (false)
+
+#define SYSCHECKSYNC(call, name, retval) do { \
+  retval = call; \
+  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+  } else { \
+    break; \
+  } \
+} while(true)
+
+// Propagate errors up
+#define NCCLCHECK(call) do { \
+  ncclResult_t res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+    return res; \
+  } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+  res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+    goto label; \
+  } \
+} while (0);
+
+#endif
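A usage sketch for the relocated macros (the function names are hypothetical): CUDACHECK converts a failed CUDA call into ncclUnhandledCudaError and returns early, while NCCLCHECK propagates an ncclResult_t upward, logging file and line along the way:

    #include "checks.h"  // assumed include path within the NCCL tree

    static ncclResult_t clearBuffer(void* devPtr, size_t bytes) {
      CUDACHECK(cudaMemset(devPtr, 0, bytes));  // early-returns on CUDA error
      return ncclSuccess;
    }

    static ncclResult_t prepareBuffer(void* devPtr, size_t bytes) {
      NCCLCHECK(clearBuffer(devPtr, bytes));    // propagates failure and logs
      return ncclSuccess;
    }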
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#if CUDART_VERSION < 9000
+struct cudaLaunchParams {
+  void *func;
+  dim3 gridDim;
+  dim3 blockDim;
+  void **args;
+  size_t sharedMem;
+  cudaStream_t stream;
+};
+#endif
+
+#define MAXCHANNELS 16
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+
+struct ncclSendMem {
+  union {
+    struct {
+      uint64_t head;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      void* ptrExchange;
+      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+      uint64_t opCount;
+    };
+    char pad3[MEM_ALIGN];
+  };
+};
+
+struct ncclRecvMem {
+  union {
+    struct {
+      uint64_t tail;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      uint64_t opCount;
+      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      int sizesFifo[NCCL_STEPS];
+    };
+    char pad4[MEM_ALIGN];
+  };
+  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {
+  struct ncclChannel channels[MAXCHANNELS];
+
+  struct ncclPeerInfo* peerInfo;
+
+  void* bootstrap;
+
+  int rank;    // my rank in the communicator
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; // my cuda device index
+  int nvmlDev; // my NVML device number
+
+  enum { GROUP, PARALLEL } launchMode;
+  cudaStream_t userStream;
+  bool userStreamSet;
+  cudaEvent_t doneEvent;
+  bool checkPointers;
+
+  // Counter to make sure collectives match (needed for bcast/reduce
+  // where syncs are not symmetric).
+  uint64_t opCount;
+
+  // Channels for collectives
+  int nChannels;
+  int nThreads;
+
+  // Low-latency algorithm threshold
+  ssize_t llThreshold;
+  ssize_t threadThreshold;
+
+  // Tree algorithm threshold
+  ssize_t treeThreshold;
+
+  // An internal CUDA stream for NCCL kernel CGMD launches
+  int groupCudaStream;
+  cudaStream_t groupStream;
+
+  // Whether there has been a fatal error in this communicator.
+  ncclResult_t fatalError;
+
+  // Error reported by GPU
+  volatile ncclDevError_t* fatalDevError;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+
+  // Device side of the communicator
+  struct ncclDevComm *devComm;
+  // Host copy of the devComm (to free CUDA allocs)
+  struct ncclDevComm hostDevComm;
+
+  // Intra-process sync
+  int intraRank;
+  int intraRanks;
+  int* intraBarrier;
+  int intraPhase;
+
+  // Storage for deferred intra-process launch
+  struct cudaLaunchParams * intraParams;
+  struct cudaLaunchParams *myParams;
+  int* intraCudaDevs;
+  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+  struct ncclColl args;
+  void* argsptr;
+
+  // Global proxy thread
+  pthread_t proxyThread;
+  struct ncclProxyState proxyState;
+};
+
+#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,385 +7,20 @@
 #ifndef NCCL_CORE_H_
 #define NCCL_CORE_H_
 
-#define NCCL_MAX_OPS 2048
-#define NCCL_STEPS 8
-
-#include <pthread.h>
-#include <algorithm>
 #include "nccl.h"
-#include "transport.h"
 #include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "transport.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "info.h"
+#include "argcheck.h"
 #include <cstdio>
+#include <algorithm> // std::min/std::max
 #include <unistd.h>
 #include <stdlib.h>
 #include <cuda_runtime.h>
 
-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
-  void *func;
-  dim3 gridDim;
-  dim3 blockDim;
-  void **args;
-  size_t sharedMem;
-  cudaStream_t stream;
-};
-#endif
-
-#define MAXCHANNELS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NCCL_LL_MIN_NTHREADS 64
-
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
-  /* Flags have to be *after* data, because otherwise, an incomplete receive
-     from the network may receive the flag but not the data.
-     Note this is assuming that either we receive contiguous chunks of data
-     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
-  struct {
-    uint32_t data1;
-    uint32_t flag1;
-    uint32_t data2;
-    uint32_t flag2;
-  };
-  uint64_t v[2];
-  int4 i4;
-};
-
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-typedef enum {
-  ncclPatternRing,
-  ncclPatternRingTwice,
-  ncclPatternPipelineFrom,
-  ncclPatternPipelineTo,
-  ncclPatternTreeUp,
-  ncclPatternTreeDown,
-  ncclPatternTreeUpDown
-} ncclPattern_t;
-
-typedef enum {
-  ncclDevSuccess,
-  ncclDevAssertedMismatch,
-  ncclDevSuspectedMismatch
-} ncclDevError_t;
-
-// Used to pass NCCL call information between functions
-struct ncclInfo {
-  ncclColl_t coll;
-  const char* opName;
-  // NCCL Coll Args
-  const void* sendbuff;
-  void* recvbuff;
-  size_t count;
-  ncclDataType_t datatype;
-  ncclRedOp_t op;
-  int root;
-  ncclComm_t comm;
-  cudaStream_t stream;
-  // Algorithm details
-  int chunkSteps;
-  int sliceSteps;
-  // Computed later
-  ncclPattern_t pattern;
-  size_t nBytes;
-  int nstepsPerLoop;
-  int nchunksPerLoop;
-};
-
-struct ncclConnInfo {
-  // Regular comm mechanism
-  char *buff;         // Local for recv, remote for send
-  uint64_t *tail;     // Local for recv, remote for send
-  uint64_t *head;     // Local for send, remote for recv
-  uint64_t *opCountLoc; // opCount of local rank
-  uint64_t *opCountRem; // opCount of remote rank
-
-  int direct;         // Direct communication
-  void **ptrExchange; // Pointer exchange for direct communication
-
-  int *fifo;          // Size fifo for proxy
-
-  uint64_t step;      // Keep where we are
-
-  // Low latency mechanism
-  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
-  uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
-  int connected;
-  struct ncclProxyArgs *proxyAppend;
-  struct ncclTransportComm* transportComm;
-  void* transportResources; // Host-side resources
-  struct ncclConnInfo conn;
-  struct ncclComm *comm;
-};
-
-#define CACHE_LINE_SIZE 128
-#define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
-  union {
-    struct {
-      uint64_t head;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      void* ptrExchange;
-      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
-      uint64_t opCount;
-    };
-    char pad3[MEM_ALIGN];
-  };
-};
-
-struct ncclRecvMem {
-  union {
-    struct {
-      uint64_t tail;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      uint64_t opCount;
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      int sizesFifo[NCCL_STEPS];
-    };
-    char pad4[MEM_ALIGN];
-  };
-  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
-  char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
-  // Shortcuts for userRanks[1] and userRanks[n-1]
-  int prev;
-  int next;
-
-  // Maps an internal nccl index to user-specified rank order. This is necessary
-  // since we need to know how the user expects data to be ordered across
-  // devices. Ordered from current device.
-  int* userRanks;
-  int* devUserRanks;
-};
-
-#define NCCL_MAX_TREE_ARITY 3
-struct ncclTree {
-  int depth;
-  int up;
-  int down[NCCL_MAX_TREE_ARITY];
-};
-
-struct ncclPeer {
-  struct ncclConnector send;
-  struct ncclConnector recv;
-};
-
-struct ncclChannel {
-  union {
-    struct {
-      struct ncclRing ring;
-      struct ncclTree tree;
-
-      int id;
-      int nthreads;
-      int buffSize;
-
-      // Communication structures
-      struct ncclPeer* peers;
-      struct ncclPeer* devPeers;
-
-      // Operation list for aggregation
-      struct ncclColl* collectives;
-      struct ncclColl* devCollectives;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
-    };
-    int data[0x80];
-  };
-};
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
-
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclComm* comm;
-  uint64_t opCount;
-
-  // local and remote input, output, and buffer
-  const void * ThisInput;
-  void * ThisOutput;
-
-  // general parameters
-  size_t N;
-  uint32_t root;
-  uint8_t bid;
-  uint8_t nChannels;
-  uint16_t nThreads;
-
-  int lastChunkSize;
-};
-struct ncclColl {
-  union {
-    struct {
-      struct CollectiveArgs args;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t active;
-    };
-    int data[0x10];
-  };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-
-struct ncclComm {
-  struct ncclChannel channels[MAXCHANNELS];
-
-  struct ncclPeerInfo* peerInfo;
-
-  void* bootstrap;
-
-  int rank;    // my rank in the communicator
-  int nRanks;  // number of GPUs in communicator
-  int cudaDev; // my cuda device index
-  int nvmlDev; // my NVML device number
-
-  enum { GROUP, PARALLEL } launchMode;
-  cudaStream_t userStream;
-  bool userStreamSet;
-  cudaEvent_t doneEvent;
-  bool checkPointers;
-
-  // Counter to make sure collectives match (needed for bcast/reduce
-  // where syncs are not symmetric).
-  uint64_t opCount;
-
-  // Channels for collectives
-  int nChannels;
-  int nThreads;
-
-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
-
-  // Tree algorithm threshold
-  ssize_t treeThreshold;
-
-  // An internal CUDA stream for NCCL kernel CGMD launches
-  int groupCudaStream;
-  cudaStream_t groupStream;
-
-  // Whether there has been a fatal error in this communicator.
-  ncclResult_t fatalError;
-
-  // Error reported by GPU
-  volatile ncclDevError_t* fatalDevError;
-
-  // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
-  // On device: this pointer has been obtained from cudaHostGetDevicePointer()
-  volatile uint32_t *abortFlag;
-
-  // Device copy of the communicator
-  struct ncclComm *devComm;
-
-  // Intra-process sync
-  int intraRank;
-  int intraRanks;
-  int* intraBarrier;
-  int intraPhase;
-
-  // Storage for deferred intra-process launch
-  struct cudaLaunchParams * intraParams;
-  struct cudaLaunchParams *myParams;
-  int* intraCudaDevs;
-  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
-  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
-  void* argsptr;
-
-  // Global proxy thread
-  pthread_t proxyThread;
-  struct ncclProxyState proxyState;
-};
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do { \
-    cudaError_t e = cmd; \
-    if( e != cudaSuccess ) { \
-      WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
-      return ncclUnhandledCudaError; \
-    } \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do { \
-    cudaError_t e = cmd; \
-    if( e != cudaSuccess ) { \
-      WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
-      res = ncclUnhandledCudaError; \
-      goto label; \
-    } \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
-  int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
-  if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
-    return ncclSystemError; \
-  } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
-  } else { \
-    break; \
-  } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
-    return res; \
-  } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
-    goto label; \
-  } \
-} while (0);
-
 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...) \
@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
|||
  }
}

#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
  memset(*ptr, 0, size);
  *devPtr = *ptr;
  return ncclSuccess;
}

static inline ncclResult_t ncclCudaHostFree(void* ptr) {
  CUDACHECK(cudaFreeHost(ptr));
  return ncclSuccess;
}

template <typename T>
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
  void* p = malloc(nelem*sizeof(T));
  if (p == NULL) {
    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
    return ncclSystemError;
  }
  memset(p, 0, nelem*sizeof(T));
  *ptr = (T*)p;
  return ncclSuccess;
}

template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
  return ncclSuccess;
}

template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
  return ncclSuccess;
}

#endif // end include guard
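A minimal sketch (hypothetical helper, assuming the NCCLCHECK macro above is in scope) of how the typed allocators pair up to mirror host state on the device; note that the helpers take an element count, not a byte count:

template <typename T>
static ncclResult_t mirrorOnDevice(T** devPtr, T* hostPtr, size_t nelem) {
  NCCLCHECK(ncclCudaCalloc(devPtr, nelem));          // zeroed device allocation
  NCCLCHECK(ncclCudaMemcpy(*devPtr, hostPtr, nelem)); // nelem elements of T
  return ncclSuccess;
}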

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -24,7 +24,7 @@ extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
- extern ncclResult_t getHostName(char* hostname, int maxlen);
+ extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);

extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);

@@ -108,7 +108,7 @@ static inline void initDebug() {
      break;
    case 'h': // %h = hostname
      char hostname[1024];
-      getHostName(hostname, 1024);
+      getHostName(hostname, 1024, '.');
      dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
      break;
    case 'p': // %p = pid
@@ -0,0 +1,194 @@
/*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_

#include "nccl.h"
#include <stdint.h>

#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8

typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;

#define DIVUP(x, y) \
    (((x)+(y)-1)/(y))
#define ROUNDUP(x, y) \
    (DIVUP((x), (y))*(y))

#define ALIGN_SIZE(size, align) \
  size = ((size + (align) - 1) / (align)) * (align);

union ncclLLFifoLine {
  /* Flags have to be *after* data, because otherwise, an incomplete receive
     from the network may receive the flag but not the data.
     Note this is assuming that either we receive contiguous chunks of data
     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
  struct {
    uint32_t data1;
    uint32_t flag1;
    uint32_t data2;
    uint32_t flag2;
  };
  uint64_t v[2];
  int4 i4;
};

#define MAXTHREADS 256
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NUM_LINES_PER_THREAD 8
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#ifdef DEBUG_LL
#define NCCL_LL_CLEAN_MASK 0x00000ff8
#define NCCL_LL_FLAG_MAX   0x00001000
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_NSTEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");

struct ncclConnInfo {
  // Regular comm mechanism
  char *buff;           // Local for recv, remote for send
  uint64_t *tail;       // Local for recv, remote for send
  uint64_t *head;       // Local for send, remote for recv
  uint64_t *opCountLoc; // opCount of local rank
  uint64_t *opCountRem; // opCount of remote rank

  int direct;           // Direct communication
  void **ptrExchange;   // Pointer exchange for direct communication

  int *fifo;            // Size fifo for proxy

  uint64_t step;        // Keep where we are

  // Low latency mechanism
  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
  uint64_t llLastCleaning;
};

struct ncclConnector {
  int connected;
  struct ncclProxyArgs *proxyAppend;
  struct ncclTransportComm* transportComm;
  void* transportResources; // Host-side resources
  struct ncclConnInfo conn;
  struct ncclComm *comm;
};

struct ncclRing {
  // Shortcuts for userRanks[1] and userRanks[n-1]
  int prev;
  int next;

  // Maps an internal nccl index to user-specified rank order. This is necessary
  // since we need to know how the user expects data to be ordered across
  // devices. Ordered from current device.
  int* userRanks;
  int* devUserRanks;
};

#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
  int depth;
  int up;
  int down[NCCL_MAX_TREE_ARITY];
};

struct ncclPeer {
  struct ncclConnector send;
  struct ncclConnector recv;
};

struct ncclDevComm;

/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
struct CollectiveArgs {
  struct ncclDevComm* comm;
  uint64_t opCount;

  // local and remote input, output, and buffer
  const void * ThisInput;
  void * ThisOutput;

  // general parameters
  size_t N;
  uint32_t root;
  uint8_t bid;
  uint8_t nChannels;
  uint16_t nThreads;

  int lastChunkSize;
};
struct ncclColl {
  union {
    struct {
      struct CollectiveArgs args;
      uint16_t funcIndex;
      uint16_t nextIndex;
      uint8_t  active;
    };
    int data[0x10];
  };
};
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");

struct ncclChannel {
  union {
    struct {
      struct ncclRing ring;
      struct ncclTree tree;

      int id;
      int nthreads;
      int buffSize;

      // Communication structures
      struct ncclPeer* peers;
      struct ncclPeer* devPeers;

      // Operation list for aggregation
      struct ncclColl* collectives;
      struct ncclColl* devCollectives;
      int collStart;
      int collCount;
      int collFifoHead; // Only used by GPU
      int collFifoTail; // Only used by CPU
    };
    int data[0x80];
  };
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");

#define MAXCHANNELS 16

typedef enum {
  ncclDevSuccess,
  ncclDevAssertedMismatch,
  ncclDevSuspectedMismatch
} ncclDevError_t;

struct ncclDevComm {
  int rank;
  int nRanks;

  // Flag to ask NCCL kernels to abort
  volatile uint32_t *abortFlag;
  volatile ncclDevError_t *fatalDevError;

  // Channels, device side
  struct ncclChannel* channels;
};

#endif
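The LL buffer arithmetic above works out as follows (a worked example, not part of the header): each ncclLLFifoLine is 16 bytes, one slice is 8 lines/thread × 256 threads = 2048 lines, and the full fifo is 2048 lines × 8 steps × 16 B = 256 KiB per connection. The relations can be checked at compile time:

static_assert(sizeof(union ncclLLFifoLine) == 16, "two 8-byte atomic halves");
static_assert(NCCL_LL_SLICE_LINES == 2048, "8 lines/thread * 256 threads");
static_assert(NCCL_LL_BUFF_SIZE == 256*1024, "2048 lines * 8 steps * 16B");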

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -10,6 +10,12 @@
#include "core.h"
#include "group.h"

+ // Channels / LL tuning
+ #define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
+ #define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
+ #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
+ #define NCCL_LL_MIN_NTHREADS 64
+
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
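An illustrative sketch, not the actual tuning code, of how per-thread byte counts could be weighed against the thresholds above when choosing the low-latency protocol; the function, its arguments, and the 700 architecture cutoff are hypothetical:

static bool preferLowLatency(size_t nBytes, int nThreads, int cudaArch) {
  // Below the per-thread threshold, LL's lower launch/sync overhead wins.
  size_t perThread = nBytes / nThreads;
  size_t threshold = (cudaArch < 700) ? NCCL_THREAD_THRESHOLD_PREVOLTA
                                      : NCCL_THREAD_THRESHOLD;
  return perThread < threshold;
}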

@@ -4,7 +4,7 @@
 * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2005 PathScale, Inc. All rights reserved.
 *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -0,0 +1,45 @@
/*************************************************************************
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_INFO_H_
#define NCCL_INFO_H_

#include "nccl.h"

typedef enum {
  ncclPatternRing,
  ncclPatternRingTwice,
  ncclPatternPipelineFrom,
  ncclPatternPipelineTo,
  ncclPatternTreeUp,
  ncclPatternTreeDown,
  ncclPatternTreeUpDown
} ncclPattern_t;

// Used to pass NCCL call information between functions
struct ncclInfo {
  ncclColl_t coll;
  const char* opName;
  // NCCL Coll Args
  const void* sendbuff;
  void* recvbuff;
  size_t count;
  ncclDataType_t datatype;
  ncclRedOp_t op;
  int root;
  ncclComm_t comm;
  cudaStream_t stream;
  // Algorithm details
  int chunkSteps;
  int sliceSteps;
  // Computed later
  ncclPattern_t pattern;
  size_t nBytes;
  int nstepsPerLoop;
  int nchunksPerLoop;
};

#endif
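A hypothetical sketch of how a collective entry point could populate this struct before handing it to ncclEnqueueCheck() (declared in the enqueue header above); the chunk/slice step counts are illustrative, not the values NCCL actually uses:

static ncclResult_t enqueueAllReduce(const void* sendbuff, void* recvbuff,
    size_t count, ncclDataType_t datatype, ncclRedOp_t op,
    ncclComm_t comm, cudaStream_t stream) {
  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
    sendbuff, recvbuff, count, datatype, op, 0 /*root: unused here*/, comm, stream,
    4 /*chunkSteps (illustrative)*/, 2 /*sliceSteps (illustrative)*/ };
  return ncclEnqueueCheck(&info); // fills in the "Computed later" fields
}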

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -80,12 +80,13 @@ typedef struct {
  // Finalize connection establishment after remote peer has called connectHandle
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
-  // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -18,6 +18,7 @@
enum ncclNvLinkDeviceType {
  ncclNvLinkDeviceGpu,
  ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};

static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {

@@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
  char* rPath = realpath(classPath, NULL);
  int fd;
-  SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
+  if ((fd = open(rPath, O_RDONLY)) == -1) {
+    // Could not find device. It might be because we're in a VM and
+    // we don't see the whole machine. This is handled silently so
+    // we don't want to print an INFO error.
+    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
+    return ncclSystemError;
+  }
  free(rPath);
  char pciClass[9];
  strncpy(pciClass, "0x000000", 9);

@@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
  if (strcmp(pciClass, "0x068000") == 0) {
    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
    *type = ncclNvLinkDeviceSwitch;
+  } else if (strcmp(pciClass, "0x068001") == 0) {
+    // PCI device is of type "Bridge: IBM Device 04ea"
+    *type = ncclNvLinkDeviceBridge;
  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
          || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
    *type = ncclNvLinkDeviceGpu;

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
    s++;
    strncpy(envValue, line+s, 1024);
    setenv(envVar, envValue, 0);
-    char *str = getenv(envVar);
  }
  if (line) free(line);
  fclose(file);
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
- #define SLEEP_INT 1000 // sleep interval in usec
- #define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
+ #define SLEEP_INT 1000 // connection retry sleep interval in usec
+ #define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
+ #define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)

/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {

@@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif

  int ret;
-  int retries = 0;
+  int timedout_retries = 0;
+  int refused_retries = 0;
retry:
  SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
  if (ret == 0) return ncclSuccess;
-  if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
-    INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
-    usleep(SLEEP_INT);
-    goto retry;
+  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+      INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+      usleep(SLEEP_INT);
+      goto retry;
+    }
  }
  WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
  return ncclSystemError;
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -11,49 +11,35 @@
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
+ #include <stdio.h>

- #define BUSID_SIZE (sizeof("0000:00:00.0"))
- #define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+ ncclResult_t getCudaPath(int cudaDev, char** path);

- static ncclResult_t getCudaPath(int cudaDev, char** path) {
-   char busId[BUSID_SIZE];
-   CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-   for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-   char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-   memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-   memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-   *path = realpath(busPath, NULL);
-   if (*path == NULL) {
-     WARN("Could not find real path of %s", busPath);
-     return ncclSystemError;
-   }
-   return ncclSuccess;
+ static int getNumaId(char *path) {
+   char npath[PATH_MAX];
+   snprintf(npath, PATH_MAX, "%s/numa_node", path);
+   npath[PATH_MAX-1] = '\0';
+
+   int numaId = -1;
+   FILE *file = fopen(npath, "r");
+   if (file == NULL) return -1;
+   if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+   fclose(file);
+
+   return numaId;
}

enum ncclPathDist {
-   PATH_PIX = 0,
-   PATH_PXB = 1,
-   PATH_PHB = 2,
-   PATH_SOC = 3
+   PATH_PIX = 0,
+   PATH_PXB = 1,
+   PATH_PHB = 2,
+   PATH_NODE = 3,
+   PATH_SYS = 4,
+   PATH_ARRAY_SIZE = 5
};

- static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
+ extern const char* pathDists[PATH_ARRAY_SIZE];

- static int pciDistance(char* path1, char* path2) {
-   int score = 0;
-   int depth = 0;
-   int same = 1;
-   for (int i=0; i<strlen(path1); i++) {
-     if (path1[i] != path2[i]) same = 0;
-     if (path1[i] == '/') {
-       depth++;
-       if (same == 1) score++;
-     }
-   }
-   if (score <= 3) return PATH_SOC;
-   if (score == 4) return PATH_PHB;
-   if (score == depth-1) return PATH_PIX;
-   return PATH_PXB;
- }
+ int pciDistance(char* path1, char* path2);

#endif
@@ -8,6 +8,7 @@
#define NCCL_TRANSPORT_H_

#include "nccl.h"
+ #include "devcomm.h"
#include <stdint.h>
#include "nvmlwrap.h"

@@ -37,7 +38,7 @@ struct ncclConnect {
  char data[CONNECT_SIZE];
};

- enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };

struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);

@@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
  }
}

- inline void transportProxyIdle(int idle) {
-   sched_yield();
- }
-
#endif

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -10,7 +10,7 @@
#include "nccl.h"
#include <stdint.h>

- ncclResult_t getHostName(char* hostname, int maxlen);
+ ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHostHash();
uint64_t getPidHash();
@@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout;
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif

- #if CUDART_VERSION >= 9200
+ #if CUDART_VERSION >= 9020
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream

@@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
  return bootstrapGetUniqueId(out);
}

+ // Prevent compiler from optimizing out these operations
+ void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
+   comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
+ }
+
static ncclResult_t commFree(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

@@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
  if (comm->bootstrap)
    NCCLCHECK(bootstrapClose(comm->bootstrap));

+   CUDACHECK(cudaFree(comm->hostDevComm.channels));
  CUDACHECK(cudaFree(comm->devComm));

  for (int channel=0; channel<comm->nChannels; channel++)

@@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
  CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
  CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));

+   // Poison comm to try and catch a double free
+   commPoison(comm);
+
  free(comm);
  return ncclSuccess;
}

@@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  struct ncclComm* comm;
  NCCLCHECK(ncclCalloc(&comm, 1));

-   comm->rank = rank;
-   comm->nRanks = ndev;
+   comm->rank = comm->hostDevComm.rank = rank;
+   comm->nRanks = comm->hostDevComm.nRanks = ndev;
  cudaGetDevice(&comm->cudaDev);
  getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
-   INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
+   TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);

  comm->doneEvent = doneEvent;
  comm->llThreshold = ncclParamLlThreshold();
  comm->treeThreshold = ncclParamTreeThreshold();
  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
- #if CUDART_VERSION >= 9200
+ #if CUDART_VERSION >= 9020
  comm->groupCudaStream = ncclParamGroupCudaStream();
#else
  // Don't allow the user to overload the default setting in older CUDA builds

@@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
#endif
  comm->fatalError = ncclSuccess;

-   CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
+   NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
  *comm->fatalDevError = ncclDevSuccess;

-   CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
+   NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
  *comm->abortFlag = 0;

  comm->argsptr = &comm->args;

@@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
}

static ncclResult_t devCommSetup(ncclComm_t comm) {
-   // Fully duplicate the comm on the device
-   NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
-   // Copy the comm on the device
-   NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
-   // Copy userRanks
+   // Duplicate the channels on the device
+   NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
+   NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
+
+   // Copy userRanks and peers
  for (int r=0; r<comm->nChannels; r++) {
    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
+     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
  }
-   // Copy the device-accessible pointer to comm->abortFlag
-   void *devAbortFlag;
-   CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
-   CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
-   // Copy the device-accessible pointer to comm->fatalDevError
-   void *devFatalError;
-   CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
-   CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
+
+   // Duplicate the dev comm on the device
+   NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+   NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
  return ncclSuccess;
}

@@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
    }
  }

-   int ranks[nMasters];
+   int* ranks;
+   NCCLCHECK(ncclCalloc(&ranks, nMasters));
  int i = 0, masterIndex = -1;
  // Build binary tree
  for (int r=0; r<nranks; r++) {

@@ -455,6 +461,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
    tree->up = prev;
    if (treeMasters[next] == 0) tree->down[0] = next;
  }
+   free(ranks);
}

TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

@@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
    if (peer == -1) continue;
    conn = &channel->peers[peer].recv;
+     if (conn->connected) { ++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
  }

@@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
    if (peer == -1) continue;
    conn = &channel->peers[peer].send;
+     if (conn->connected) { ++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
  }

@@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
    if (peer == -1) continue;
    conn = &channel->peers[peer].send;
+     if (conn->connected) {++nSkippedSend; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
    NCCLCHECK(conn->transportComm->connect(&connect, conn));
    conn->connected = 1;

@@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
    if (peer == -1) continue;
    conn = &channel->peers[peer].recv;
+     if (conn->connected) {++nSkippedRecv; continue; }
    memset(&connect, 0, sizeof(connect));
    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
    NCCLCHECK(conn->transportComm->connect(&connect, conn));
    conn->connected = 1;
@@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
  return ncclSuccess;
}

+ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
static ncclResult_t setCpuAffinity(int cudaDev) {
-   // Work within the enveloppe we were provided
+   // Query the CPU affinity set we were provided
  cpu_set_t mask;
  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

-   // Find the subpart that is local to our GPU
+ #ifdef ENABLE_TRACE
+   {
+     char affinityStr[sizeof(cpu_set_t)*2];
+     NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+     TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
+   }
+ #endif
+
+   // Find the CPUs that are local to the supplied GPU
  cpu_set_t gpuMask;
  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
-   cpu_set_t finalMask;
-   CPU_AND(&finalMask, &mask, &gpuMask);

-   // If those are not disjoint, try to stay local
+ #ifdef ENABLE_TRACE
+   {
+     char affinityStr[sizeof(cpu_set_t)*2];
+     NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
+     TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
+   }
+ #endif
+
+   cpu_set_t finalMask;
+   if (ncclParamIgnoreCpuAffinity())
+     // Ignore the CPU affinity set and use the GPU one instead
+     finalMask = gpuMask;
+   else
+     // Use a subset of the GPU affinity set
+     CPU_AND(&finalMask, &mask, &gpuMask);

  // If there is a non empty set, use it to set affinity
  if (CPU_COUNT(&finalMask)) {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));

@@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
    comms[rank]->threadThreshold = threadThreshold;
  }

+   struct ncclConnect* connect;
+   NCCLCHECK(ncclCalloc(&connect, 2*nranks));
  for (int r=0; r<nrings; r++) {
-     struct ncclConnect connect[2*nranks];
    int* ringRanks = rings+r*nranks;
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECK(cudaSetDevice(devs[rank]));

@@ -1045,6 +1081,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
      NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
    }
  }
+   free(connect);
  free(allInfo);
  free(rings);
  free(treeIn);

@@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  int savedDevice;
  int rank, cudaDev;
  ncclComm_t comm = NULL;
-   int ncclDevList[ndev];
+   int* ncclDevList = NULL;
+   NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
  for (int i=0; i<ndev; i++) {
    ncclDevList[i] = devlist ? devlist[i] : i;
  }

-   cudaGetDevice(&savedDevice);
+   CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);

  for(rank=0; rank<ndev; ++rank)
    comms[rank] = NULL;

@@ -1118,6 +1156,7 @@ cleanup:
  }

final:
+   free(ncclDevList);
  if(wrapNvmlShutdown() != ncclSuccess)
    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
  cudaSetDevice(savedDevice);

@@ -1128,9 +1167,11 @@ final:

static ncclResult_t commDestroy(ncclComm_t comm) {
  int savedDevice;
+ #ifdef ENABLE_TRACE
+   int rank = comm->rank;
+ #endif
  CUDACHECK(cudaGetDevice(&savedDevice));
  int commDevice = comm->cudaDev;
-   int rank = comm->rank;

  if (savedDevice != commDevice) {
    CUDACHECK(cudaSetDevice(commDevice));

@@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
  if (savedDevice != commDevice)
    CUDACHECK(cudaSetDevice(savedDevice));

-   INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+   TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);

  return ncclSuccess;
}

@@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

+   TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
+
+   // Try and prevent a double free of the comm struct (user error)
+   if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
+     WARN("comm %p has already been destroyed", comm);
+     return ncclInvalidArgument;
+   }
+
  return commDestroy(comm);
}
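The guard relies on commPoison() having overwritten the identifying fields at free time, so a repeated destroy on the same stale pointer is caught instead of corrupting memory; it is best-effort only, since the freed allocation must still be readable. An illustrative call sequence (nranks, id, and myrank are hypothetical values):

ncclComm_t comm;
NCCLCHECK(ncclCommInitRank(&comm, nranks, id, myrank));
NCCLCHECK(ncclCommDestroy(comm)); // poisons rank/cudaDev/nvmlDev/nRanks, then frees
// A buggy second call now returns ncclInvalidArgument with a WARN,
// rather than silently double-freeing (GithubIssue#191):
ncclResult_t res = ncclCommDestroy(comm);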

@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

- #include "checks.h"
+ #include "argcheck.h"

static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  cudaPointerAttributes attr;

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() {
  int savedDev;
  CUDACHECK(cudaGetDevice(&savedDev));
  int done = ncclGroupIndex;
-   int doneArray[ncclGroupIndex];
+   int doneArray[MAX_ASYNC_OPS];
  for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;

  ncclResult_t ret = ncclGroupError;

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -208,8 +208,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
  NCCLCHECK(getEnvThreads(nthreads));
  for (int r = 0; r<*nrings; r++) {
    for (int i = 0; i<nranks; i++) {
-       if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
-       if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+       if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
+       if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
    }
  }
  return ncclSuccess;
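The fix above works because prev/next/treeIn/treeOut are flattened [ring][rank] tables: the per-ring entry for rank i lives at offset r*nranks+i, so indexing them with i alone silently read and wrote ring 0's data for every ring. A one-line sketch naming the layout assumption (hypothetical helper):

static inline int ringRankIdx(int ring, int rank, int nranks) {
  return ring*nranks + rank; // row-major [ring][rank]
}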

@@ -0,0 +1,51 @@
/*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "core.h"
#include "topo.h"

#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))

ncclResult_t getCudaPath(int cudaDev, char** path) {
  char busId[BUSID_SIZE];
  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
  *path = realpath(busPath, NULL);
  if (*path == NULL) {
    WARN("Could not find real path of %s", busPath);
    return ncclSystemError;
  }
  return ncclSuccess;
}

const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };

int pciDistance(char* path1, char* path2) {
  int score = 0;
  int depth = 0;
  int same = 1;
  for (int i=0; i<strlen(path1); i++) {
    if (path1[i] != path2[i]) same = 0;
    if (path1[i] == '/') {
      depth++;
      if (same == 1) score++;
    }
  }
  if (score <= 3) {
    /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
    int numaId1 = getNumaId(path1);
    int numaId2 = getNumaId(path2);
    TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
    return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
  }
  if (score == 4) return PATH_PHB;
  if (score == depth-1) return PATH_PIX;
  return PATH_PXB;
}
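A worked example of the scoring (the sysfs paths below are hypothetical; real ones come from realpath() in getCudaPath):

// score counts '/' seen while the paths still match; depth counts all '/'.
// path1: /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/0000:03:08.0/0000:04:00.0
// path2: /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/0000:03:10.0/0000:05:00.0
// Both GPUs sit behind the same PCI switch: depth = 7 and the paths diverge
// only in the downstream-port component, so score = 6 = depth-1 -> PATH_PIX.
// When score <= 3 the verdict now depends on /sys/.../numa_node instead of a
// blanket PATH_SOC: same NUMA node -> PATH_NODE, different nodes -> PATH_SYS.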

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -29,13 +29,13 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
  return ncclSuccess;
}

- ncclResult_t getHostName(char* hostname, int maxlen) {
+ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
  if (gethostname(hostname, maxlen) != 0) {
    strncpy(hostname, "unknown", maxlen);
    return ncclSystemError;
  }
  int i = 0;
-   while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+   while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
  hostname[i] = '\0';
  return ncclSuccess;
}
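The new delimiter argument lets callers choose between the short and the full host name, which is what fixes the hostname hashing issue (GithubIssue#187): hashing now sees the whole name instead of the first label. An illustrative sketch (the example hostname is hypothetical):

char name[1024];
getHostName(name, sizeof(name), '.');  // "node12.cluster.local" -> "node12" (log prefixes)
getHostName(name, sizeof(name), '\0'); // keeps the full name, as getHostHash() now does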

@@ -48,7 +48,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  if (ncclDebugLevel <= NCCL_LOG_NONE) return;

  char hostname[1024];
-   getHostName(hostname, 1024);
+   getHostName(hostname, 1024, '.');
  int cudaDev;
  cudaGetDevice(&cudaDev);

@@ -104,8 +104,8 @@ uint64_t getHash(const char* string) {
 */
uint64_t getHostHash(void) {
  char uname[1024];
-   // Start off with the hostname
-   (void) getHostName(uname, sizeof(uname));
+   // Start off with the full hostname
+   (void) getHostName(uname, sizeof(uname), '\0');
  int offset = strlen(uname);
  int len;
  // $(readlink /proc/self/ns/uts)

@@ -28,7 +28,7 @@ static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF
static ncclTvalue_t getTvalue(short* distances, int ndev) {
  ncclTvalue_t tvalue = 0;
  for (int d=0; d<ndev; d++) {
-     int score = 1 + PATH_SOC - distances[d];
+     int score = 1 + PATH_SYS - distances[d];
    // Keep 3 bits of score info per dev
    tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
  }

@@ -81,7 +81,7 @@ static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
  ncclResult_t err;
  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
  err = ncclNetPciPath(dev, &nicPath);
-   *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+   *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
  if (nicPath) free(nicPath);
  if (cudaPath) free(cudaPath);
  return ncclSuccess;

@@ -173,19 +173,19 @@ static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroup
  return -1;
}


ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  int nGroups = groups[nranks-1] + 1;
-   int cardUsed[NET_MAX_IFS*nGroups];
-   for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
+   int *cardUsed, *starts, *ends;
+   NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
+   NCCLCHECK(ncclCalloc(&starts, nGroups));
+   NCCLCHECK(ncclCalloc(&ends, nGroups));

  for (int ring = 0; ring<*nringsRet; ring++) {
-     int starts[nGroups];
-     int ends[nGroups];
    for (int group = 0; group<nGroups; group++) {
      int nranksInGroup = 0;
      int nsubGroups = 0;
-       for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
+       for (int rank=0; rank<nranks; rank++)
+         if (groups[rank] == group) {
        nranksInGroup++;
        nsubGroups = std::max(subgroups[rank], nsubGroups);
      }

@@ -207,7 +207,7 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
    }
    if (starts[group] == -1 || ends[group] == -1) {
      *nringsRet = ring;
-       return ncclSuccess;
+       goto done;
    }
  }
  // Link groups together

@@ -217,6 +217,10 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
      prev[ring*nranks+starts[nextGroup]] = ends[group];
    }
  }
+ done:
+   free(cardUsed);
+   free(starts);
+   free(ends);
  return ncclSuccess;
}

@@ -432,11 +436,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
  if (args->head < args->end) {
    if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
      volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+       volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
      if (args->llMode) {
        int buffSlot = args->tail%NCCL_STEPS;
        int size = sizesFifo[buffSlot];
        if (size != -1) {
-           uint32_t flag = args->tail + 1;
+           uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
          size = nFifoLines * sizeof(union ncclLLFifoLine);
          union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;

@@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
          }
        }
      }
-     } else if (args->tail < resources->hostRecvMem->tail) {
+     } else if (args->tail < *recvTail) {
      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
      int stepSize = args->channel->buffSize/NCCL_STEPS;
      // Send through network

@@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
    if (args->head == args->end) {
      resources->step = args->end;
      args->idle = 0;
-       args->state = ncclProxyOpDone;
+       args->state = ncclProxyOpNone;
    }
  }
-   if (args->state == ncclProxyOpDone) {
-     union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
-     if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-       for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
-       resources->step += NCCL_STEPS;
-       resources->hostSendMem->head = resources->step;
-       resources->llLastCleaning = resources->step;
-     }
-     args->state = ncclProxyOpNone;
-   }
  return ncclSuccess;
}

@@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
    struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
    char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
    void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
-     if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+     volatile uint64_t* sendHead = &resources->hostSendMem->head;
+     if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
      int buffSlot = args->tail%NCCL_STEPS;
      int sliceSize = stepSize * args->sliceSteps;
      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));

@@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
    if (args->head == args->end) {
      resources->step = args->end;
      args->idle = 0;
-       args->state = ncclProxyOpDone;
+       args->state = ncclProxyOpNone;
    }
  }
-   if (args->state == ncclProxyOpDone) {
-     if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-       resources->step += NCCL_STEPS;
-       while (resources->hostSendMem->head < resources->step);
-       resources->llLastCleaning = resources->step;
-     }
-     args->state = ncclProxyOpNone;
-   }
  return ncclSuccess;
}
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
    }
    int found = 0;
    struct ibv_device_attr devAttr;
+     memset(&devAttr, 0, sizeof(devAttr));
    if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
      WARN("NET/IB : Unable to query device %s", devices[d]->name);
      if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) {
/* Determine if we can communicate with the peer through p2p */
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
  // Do not use P2P across root complexes by default (provided CUDA permits it)
-   int p2pLevel = PATH_SOC;
+   int p2pLevel = PATH_NODE;
  if (ncclParamP2pDisable() == 1) p2pLevel = 0;
  if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();

@@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
  int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-   if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+   if (peerCudaDev == -1) {
+     // Peer's CUDA device is not visible in this process
+ #if CUDART_VERSION >= 10010
+     // But in CUDA 10.1 we can still communicate with 'invisible' devices
+     TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
+     // Check for NVLink/NVswitch including P2P access
+     int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+     if (nvlinkp2p > 0) {
+       *ret = nvlinkp2p;
+       return ncclSuccess;
+     }
+ #endif
+     return ncclSuccess;
+   }

  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);

  // Do not detect topology if we're on the same GPU. Note this is not really supported.
  if (myInfo->cudaDev == peerCudaDev) {
-     *ret = 1 + PATH_SOC;
+     *ret = 1 + PATH_SYS;
    return ncclSuccess;
  }

@@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
  if (err1 == ncclSuccess && err2 == ncclSuccess) {
    int distance = pciDistance(myPath, peerPath);
    if (distance < p2pLevel) {
-       *ret = 1 + PATH_SOC - distance;
+       *ret = 1 + PATH_SYS - distance;
    }
  }
  if (err1 == ncclSuccess) free(myPath);

@@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
  return ncclSuccess;
}

+ #define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
+ #define MAXGPUS_PCI 64
+
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
  int nrings = 0;
  ncclTvalue_t* line = matrix+current*n;

@@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
      }
    }
  } else {
-     int ringsSave[nRingsMax*n];
+     int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
    int maxStep = 0;
    for (int i=0; i<n; i++) {
      if (inTheRing[i] == 0 && line[i] > 0) {

@@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
}

static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
-   for (int score = PATH_SOC+1; score >= minScore; score--) {
+   for (int score = PATH_SYS+1; score >= minScore; score--) {
    int best = -1;
-     int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
+     int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
    for (int n = 0; n < nranks; n++) {
      if (inRing[n]) continue;
      if (values[rank*nranks+n] == score) {

@@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
    int start = findConnect(nranks, prev+r*nranks);
    int end = findConnect(nranks, next+r*nranks);

-     int inRing[nranks];
+     int inRing[MAXGPUS_PCI];
    for (int i=0; i<nranks; i++) inRing[i] = 0;

    if (start == -1 && end == -1) {
@@ -405,10 +421,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
      links += val/CONNECT_NVLINK;
    }
    if (rank == 0) directLinks = links;
-       else directLinks = std::min(directLinks, links);
+     else directLinks = std::min(directLinks, links);
  }
  if (directLinks > 0) {
    // NVLink : Connect rings or create new ones
+     if (nranks > MAXGPUS_NVLINKP2P) {
+       WARN("Recursive P2P computation cannot work for >8 GPUs");
+       return ncclInternalError;
+     }
    nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
    goto end;
  }

@@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) {
  if (sendRes->ipcPtr)
    CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
  CUDACHECK(cudaFree(sendRes->devMem));
+   free(sendRes);
  return ncclSuccess;
}

@@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) {
  if (recvRes->ipcPtr)
    CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
  CUDACHECK(cudaFree(recvRes->devMem));
+   free(recvRes);
  return ncclSuccess;
}

@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

@@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
  return -1;
}

+ #define MAXGROUPS 16
+
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
  int nGroups = groups[nranks-1] + 1;
-   int starts[nGroups];
-   int ends[nGroups];
+   int starts[MAXGROUPS];
+   int ends[MAXGROUPS];
  for (int ring = 0; ring<*nringsRet; ring++) {
    int startGroup = -1, endGroup = -1;
    for (int group = 0; group<nGroups; group++) {